1 Dependencies and settings

# R packages
library("corrr") # 0.4.2
library("cowplot") # 1.0.0
library("dplyr") # 1.0.0
library("DT") #0.14
library("ggplot2") # 3.3.1
library("glmnet") # 4.0-2
library("oem") # 2.0.10
library("gridExtra") # 2.3
library("knitr") # 1.28
library("kableExtra") # 1.1.0
library("magrittr") # 1.5
library("mice") # 3.9.0
library("pROC") # 1.16.2
library("purrr") # 0.3.4
library("rcompanion") # 2.3.25
library("rms") # 6.0.1
library("splines") # base
library("tableone") # 12.0.0
library("tibble") # 3.0.1
library("tidyr") # 1.1.0

# Namespace clash
select <- dplyr::select

# Settings
set.seed(123)
theme_set(theme_bw())
n_imputations <- 5    # Number of multiple imputations with MICE
n_boot_val <- 40      # Number of bootstraps with rms validation and calibration
n_boot_probs <- 100   # Number of bootstraps for computing predicted probabilitiy CIs
n_rep <- 20           # Number of repeats with grouped lasso
CACHE <- TRUE         # Whether to use cached results for long computations

2 Data

We begin by loading the “training” dataset and restrict to (i) patients age 18 and older and (ii) with an index date more than 2 weeks prior to the data release. Note that we are not unfortunately allowed to publicly share this data.

2.1 Load

filter_ie <- function(data){
  data1 <- data %>%
    filter(age >= 18) 
  data2 <- data1 %>%
    filter(as.Date("2020-06-05") - index_date > 14)
  n_dropped <- nrow(data1) - nrow(data2)
  percent_dropped <- formatC(100 * n_dropped/nrow(data1), 
                             format = "f", digits = 2)
  message(n_dropped, " (", percent_dropped, "%)",
          " patients were dropped due to the 2-week cutoff.")
  return(data2)
}

train_data <- readRDS("train_data.rds")  %>%
  filter_ie()
## 750 (5.21%) patients were dropped due to the 2-week cutoff.

2.2 Clean

# Function to add race/ethnicity variable to dataset (done after separately 
# imputing missing race + ethnicity information)
add_race_ethnicity <- function(data){
  data %>% mutate(
    race_ethnicity = case_when(
      race == "Caucasian" & ethnicity != "Hispanic" ~ "Non-Hispanic white",
      race == "African American" & ethnicity != "Hispanic" ~ "Non-Hispanic black",
      race == "Asian" & ethnicity != "Hispanic" ~ "Asian",
      is.na(race) | is.na(ethnicity) ~ NA_character_,
      TRUE ~ "Hispanic"
    ),
    race_ethnicity = relevel(factor(race_ethnicity), ref = "Non-Hispanic white")
  )
}
clean_data <- function(data){
  # Recoding
data <- data %>% 
  mutate(
    ## "died" in add_deaths() should be an integer
    died = as.integer(died),
    
    ## Convert unknown or other/unknown to missing
    race = ifelse(race == "Other/Unknown", NA, race),
    ethnicity = ifelse(ethnicity == "Unknown", NA, ethnicity),
    sex = ifelse(sex == "Unknown", NA, sex),
    
    ## Oxygen saturation should have plausible values
    spo2 = ifelse(spo2 == 0, NA_real_, spo2),
    spo2 = ifelse(spo2 > 100, 100, spo2),
    
    # # Division
    ## Set "Other" region to missing since all 9 geographic regions are in the data
    division = ifelse(division == "Other", NA, division),
    
    ## Move small categories to other
    division = ifelse(division %in% c("East South Central", "Mountain"),
                      "Other", 
                      division)
  ) %>%
  
  ## Better names for some variables
  rename(diabunc = diab, cci = score) %>%
  
  ## CCI should be an integer
  mutate(cci = as.integer(cci)) %>%
  
  ## Convert comorbidities to character
  mutate_at(
    c("ami", "chf", "pvd", "cevd", "dementia", "cpd", "rheumd", "pud",
      "mld", "diabunc", "diabwc", "hp", "rend", "canc", "msld", "metacanc",
      "aids", "hypc", "hypunc"),
    function (x) ifelse(x == 1, "Yes", "No")
  ) 

# Create "derived" variables
data <- data %>%
  mutate(
    calendar_time = as.numeric(index_date - min(index_date)),
    index_month = as.integer(format(index_date, "%m")),
    death_month = as.integer(format(date_of_death,"%m")),
    os_days = pmax(0, date_of_death - index_date),
    
    ## Categorize BMI
    bmi_cat = case_when(
      bmi < 18.5 ~ "Underweight",
      bmi >= 18.5 & bmi < 25 ~ "Normal",
      bmi >= 25 & bmi < 30 ~ "Overweight",
      bmi >= 30 ~ "Obese",
      TRUE ~ NA_character_
    ),
    
    ## Combine comorbidities
    diab = case_when(
      diabunc == "Yes" | diabwc == "Yes" ~ "Yes",
      TRUE ~ "No"
    ),
    hyp = case_when(
      hypc == "Yes" | hypunc == "Yes" ~ "Yes",
      TRUE ~ "No"
    )
  ) %>%
  
  ## Create race/ethnicity variable
  ## Later to be created after imputation from imputed race + ethnicity
  add_race_ethnicity() 

  # Return
  return(data)
}

train_data <- clean_data(train_data)

2.3 Label

# Labels
## Categorical variables
demographic_cat_vars <- tribble(
  ~var, ~varlab,
  "sex", "Sex",
  "race", "Race",
  "ethnicity", "Ethnicity",
  "division", "Geographic division",
  "smoke", "Smoking",
  "race_ethnicity", 'Race/Ethnicity'
) %>% 
  mutate(group = "Demographics")

comorbidity_cat_vars <- tribble(
  ~var, ~varlab,
  "ami", "Acute myocardial infarction",
  "chf", "Congestive heart failure",
  "pvd", "Peripheral vascular disease",
  "cevd", "Cerebrovascular disease",
  "dementia", "Dementia",
  "cpd", "Chronic pulmonary disease",
  "rheumd", "Rheumatoid disease",
  "pud", "Peptic ulcer disease",
  "mld", "Mild liver disease",
  "diabunc", "Diabetes (no complications)",
  "diabwc", "Diabetes (complications)",
  "hp", "Hemiplegia or paraplegia",
  "rend", "Renal disease",
  "canc", "Cancer",
  "msld", "Moderate/severe liver disease",
  "metacanc", "Metastatic cancer",
  "aids", "AIDS/HIV",
  "hypunc", "Hypertension (no complications)",
  "hypc", "Hypertension (complications)",
  "diab", "Diabetes", # Combines diabunc and diabwc
  "hyp", "Hypertension", # Combine hypunc and hypc
) %>%
  mutate(group = "Comorbidities")

vital_cat_vars <- tribble(
  ~var, ~varlab,
  "bmi_cat", "Body Mass Index (BMI)",
) %>%
  mutate(group = "Vitals")

cat_vars <- bind_rows(demographic_cat_vars,
                      comorbidity_cat_vars,
                      vital_cat_vars)

## Continuous variables
demographic_continuous_vars <- tribble(
  ~var, ~varlab,
  "age", "Age",
  "calendar_time", "Calendar time"
) %>%
  mutate(group = "Demographics")

comorbidity_continuous_vars <- tribble(
  ~var, ~varlab,
  "cci", "CCI",
) %>%
  mutate(group = "Comorbidities")

vital_continuous_vars <- tribble(
  ~var, ~varlab,
  "bmi", "Body Mass Index (BMI)",
  "dbp", "Diastolic blood pressure",
  "sbp", "Systolic blood pressure",
  "hr", "Heart rate",
  "resp", "Respiration rate",
  "spo2", "Oxygen saturation",
  "temp", "Temperature",
) %>%
  mutate(group = "Vitals")

lab_vars <- tribble(
  ~var, ~varlab,
  "alt", "Alanine aminotransferase (ALT)",
  "ast", "Aspartate aminotransferase (AST)",
  "crp", "C-reactive protein (CRP)",
  "creatinine", "Creatinine",
  "ferritin", "Ferritin",
  "d_dimer", "Fibrin D-Dimer",
  "ldh", "Lactate dehydrogenase (LDH)",
  "lymphocyte", "Lymphocyte count",
  "neutrophil", "Neutrophil count",
  "pct", "Procalcitonin",
  "tni", "Troponin I",
  "plt", "Platelet count (PLT)",
  "wbc", "White blood cell count (WBC)"
) %>%
  mutate(group = "Labs")

continuous_vars <- bind_rows(
  demographic_continuous_vars,
  comorbidity_continuous_vars,
  vital_continuous_vars,
  lab_vars
)

# All variables
vars <- bind_rows(cat_vars,
                  continuous_vars)

get_var_labs <- function(v){
  vars$varlab[match(v, vars$var)]
}

3 Preliminary data analysis

Before starting modeling, it’s a good idea to carefully inspect the data.

3.1 Sample size

train_data %>%
  count(name = "Sample size") %>% 
  mutate(Data = "Optum training set") %>%
  select(Data, `Sample size`) %>%
  kable() %>%
  kable_styling()
Data Sample size
Optum training set 13658

3.2 Missing data

missing_df <- train_data %>%
  select(one_of(vars$var)) %>%
  mutate_all(function (x) ifelse(is.na(x), 1, 0)) %>%
  mutate(id = factor(1:n())) %>%
  pivot_longer(cols = vars$var, names_to = "var", values_to = "missing") %>%
  left_join(vars, by = "var")

3.2.1 Proportion missing

# Compute proportion missing
prop_missing <- missing_df %>%
  group_by(varlab) %>%
  summarise(prop = mean(missing))
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot
ggplot(prop_missing, aes(x = varlab, y = prop)) +
  geom_bar(stat = "identity") + 
  geom_text(aes(label = formatC(prop, format = "f", digits = 2)),
          nudge_y = .03, size = 3) +
ylim(c(0, 1)) +
xlab("") +
ylab("Proportion") +
coord_flip() +
scale_x_discrete(limits = rev(sort(vars$varlab)))

3.2.2 Missing by patient

ggplot(missing_df,
       aes(x = id, y = varlab,  fill = factor(missing))) +
  geom_raster() + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "bottom") +
  scale_fill_manual(name = "Missing",
                    values = c("lightgrey", "steelblue"),
                    labels = c("No", "Yes")) +
  scale_y_discrete(limits = rev(sort(vars$varlab)))
## Warning: Raster pixels are placed at uneven vertical intervals and will be shifted. Consider using geom_tile() instead.

missing_df %>% 
  # Count of missing by patient
  group_by(id) %>%
  summarise(n_missing = sum(missing),
            prop_missing = n_missing/n()) %>%
  
  # Plot
 ggplot(aes(x = prop_missing)) +
  geom_histogram(binwidth = .03, color = "white") +
  scale_x_continuous(breaks = seq(0, 1, .05)) +
  scale_y_continuous(n.breaks = 20) +
  xlab("Proportion of covariates that are missing") +
  ylab("Count") 
## `summarise()` ungrouping output (override with `.groups` argument)

3.3 Distributions

3.3.1 Covariates

3.3.1.1 Categorical

cat_var_df <-  train_data %>%
  select(one_of("ptid", cat_vars$var)) %>%
  pivot_longer(cols = cat_vars$var, names_to = "var", values_to = "value") %>%
  left_join(cat_vars, by = "var") %>%
  filter(!is.na(value)) %>%
  group_by(var, varlab, value) %>%
  summarise(n = n()) %>%
  group_by(varlab) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  mutate(
    nudge_x = case_when(
      freq < 0.5 ~ 0.15,
      TRUE ~ -0.15
    )
  )
## `summarise()` regrouping output by 'var', 'varlab' (override with `.groups` argument)
ggplot(cat_var_df,
       aes(x = freq, y = value)) +
  geom_point() + 
  geom_text(aes(label = formatC(freq, format = "f", digits = 2)),
            nudge_x = cat_var_df$nudge_x, size = 3.5) +
  facet_wrap(~varlab, scales = "free_y", ncol = 4) +
  xlim(0, 1) +
  xlab("Proportion") +
  ylab("") +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        strip.text.x = element_text(size = 7))

3.3.1.2 Continuous

3.3.1.2.1 Box plot
pivot_continuous_longer <- function(data, vars){
  col_names <- vars$var
  train_data %>%
    select(one_of("ptid", col_names)) %>%
    pivot_longer(cols = col_names, 
                 names_to = "var", 
                 values_to = "value") %>%
    left_join(vars, by = "var") %>%
    filter(!is.na(value)) 
}
continuous_var_df <- pivot_continuous_longer(train_data, 
                                             vars = continuous_vars)

plot_box <- function(data){
  ggplot(data, 
       aes(x = varlab, y = value)) +
  geom_boxplot(outlier.size = 1) + 
  facet_wrap(~varlab, scales = "free") +
  xlab("") +
  ylab("Value") +
   theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        strip.text = element_text(size = 7))
}
plot_box(continuous_var_df)

3.3.1.2.2 Histogram
plot_hist <- function(data){
  ggplot(data,
       aes(x = value)) +
  geom_histogram(bins = 40, color = "white") +
  facet_wrap(~varlab, scales = "free", ncol = 4) + 
  xlab("") + ylab("Frequency") +
  theme(strip.text = element_text(size = 7))
}
plot_hist(continuous_var_df)

3.3.1.2.3 Outliers

Visual inspection of the box plots and histograms suggests that there are significant outliers in the labs. Let’s look at how many observations lie above the 99th percentile of the data and the “outer fence” (defined as the 3rd quartile plus 3 time the interquartile range). We will then create new lab variables truncated from above at the outer fence and replot the histograms.

outer_fence <- function(v){
  q1 <- quantile(v, .25, na.rm = TRUE)
  q3 <- quantile(v, .75, na.rm = TRUE)
  iq <- (q3 - q1)
  return(as.numeric(q3 + 3 * iq))
}

format_percent <- function(x){
  paste0(formatC(100 * x, format = "f", digits = 1), "%")
}

train_data %>%
  select(one_of(lab_vars$var)) %>%
  pivot_longer(cols = lab_vars$var, names_to = "Lab") %>%
  filter(!is.na(value)) %>%
  group_by(Lab) %>%
  summarise(Maximum = max(value),
            `99%` = quantile(value, .99), 
            `Outer fence` = outer_fence(value),
            `% above outer fence` = format_percent(mean(value > outer_fence(value)))) %>%
  mutate(Lab = get_var_labs(Lab)) %>%
  kable() %>%
  kable_styling()
## `summarise()` ungrouping output (override with `.groups` argument)
Lab Maximum 99% Outer fence % above outer fence
Alanine aminotransferase (ALT) 3017.00 224.700000 130.00 3.4%
Aspartate aminotransferase (AST) 7000.01 336.000000 157.00 3.6%
Creatinine 27.35 10.880500 3.24 6.4%
C-reactive protein (CRP) 638.00 359.030000 458.00 0.2%
Fibrin D-Dimer 114475.00 19499.300000 4993.00 7.0%
Ferritin 100000.01 7746.620000 3648.00 3.4%
Lactate dehydrogenase (LDH) 14007.00 1262.440000 1050.00 1.9%
Lymphocyte count 120.35 3.740000 3.50 1.2%
Neutrophil count 82.40 18.900000 18.29 1.2%
Procalcitonin 753.66 30.122600 1.27 10.2%
Platelet count (PLT) 1213.00 527.860000 569.00 0.7%
Troponin I 95.40 2.190225 0.17 8.1%
White blood cell count (WBC) 127.50 22.600000 21.70 1.2%
# Truncate labs using outer fence
truncate_max <- function(v) outer_fence(v)

add_truncated_lab_vars <- function(data, v){
  for (i in 1:length(v)){ # Start loop over labs
    original_var <- v[i]
    truncated_var <- paste0(original_var, "_t")
    truncated_max_i <- truncate_max(data[[original_var]])
    data <- data %>% mutate(
      !!truncated_var := ifelse(get(original_var) > truncated_max_i, 
                                truncated_max_i, 
                                get(original_var))
    )
  } # End loop over labs
  return(data)
}
train_data <- add_truncated_lab_vars(train_data, v = lab_vars$var)

After truncating the labs, the probability distributions appear more reasonable.

lab_vars_t <- lab_vars %>%
  mutate(var = paste0(var, "_t"))
vars <- bind_rows(vars, lab_vars_t) 

continuous_vars_t <- bind_rows(
  demographic_continuous_vars,
  vital_continuous_vars,
  lab_vars_t
)

continuous_var_t_df <- pivot_continuous_longer(train_data, 
                                               vars = continuous_vars_t)
plot_hist(continuous_var_t_df)

3.3.2 Outcomes

train_data %>%
  count(died) %>%
  mutate(died = ifelse(died == 1, "Yes", "No"),
         prop = n/sum(n)) %>%
  ggplot(aes(x = died, y = prop)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(prop, format = "f", digits = 2)),
            nudge_y = .03, size = 3) +
  xlab("Died") +
  ylab("Proportion") 

3.3.2.1 Time to death

train_data %>%
  mutate(months_to_death = death_month - index_month) %>%
  filter(!is.na(months_to_death)) %>%
  group_by(months_to_death) %>%
  tally() %>%
  
  # Plot
  ggplot(aes(x = factor(months_to_death), y = n)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Months from index date to death") +
  ylab("Count")

3.4 Bivariate relationships

3.4.1 Age and comorbidities

We start with the Charlson Comorbidity Index (CCI).

ggplot(train_data, aes_string(x = "age", y = "cci")) + 
  geom_point() +
  geom_smooth() +
  xlab(get_var_labs("age")) +
  ylab(get_var_labs("cci"))

Next, we will examine each comorbidity separately by plotting the probability that a patient has each comorbidity as a function of their age.

train_data[, c("age", comorbidity_cat_vars$var)] %>%
  pivot_longer(cols = comorbidity_cat_vars$var,
               names_to = "comorbidity") %>%
  mutate(value = ifelse(value == "No", 0L, 1L),
         comorbidity = get_var_labs(comorbidity)) %>%
  
  # Make plot
  ggplot(aes(x = age, y = value)) + 
    geom_smooth() +
    facet_wrap(~comorbidity, ncol = 4) +
   xlab(get_var_labs("age")) +
   ylab("Probability")

3.4.2 Age and vitals

plot_scatter_continuous <- function(x_var, y_vars){
  train_data[, c(x_var, y_vars)] %>%
    pivot_longer(cols = all_of(y_vars)) %>%
    mutate(name = get_var_labs(name)) %>%
  
  # Make plot
  ggplot(aes_string(x = x_var, y = "value")) + 
    geom_smooth(se = FALSE) +
    facet_wrap(~name, ncol = 4, scales = "free_y") +
    xlab(get_var_labs(x_var)) +
    ylab("Value")
}

plot_scatter_continuous("age", vital_continuous_vars$var)
## Warning: Removed 3863 rows containing non-finite values (stat_smooth).

3.4.3 Age and labs

plot_scatter_continuous("age", lab_vars_t$var) +
   theme(strip.text.x = element_text(size = 7))
## Warning: Removed 55407 rows containing non-finite values (stat_smooth).

3.4.4 Comorbidities and labs/vitals

plot_scatter_continuous("cci", c(vital_continuous_vars$var, lab_vars_t$var)) +
  scale_x_continuous(breaks = sort(unique(train_data$cci))) +
  theme(strip.text.x = element_text(size = 7))
## Warning: Removed 59270 rows containing non-finite values (stat_smooth).

3.5 Table 1

# Select variables for table 1
varnames_to_remove <- c("race", "ethnicity", "bmi_cat", "hypunc", "hypc", 
                        "diabunc", "diabwc")

tbl1_varnames <- bind_rows(
  demographic_continuous_vars,
  demographic_cat_vars %>% arrange(varlab),
  comorbidity_cat_vars %>% arrange(varlab),
  comorbidity_continuous_vars,
  vital_continuous_vars %>% arrange(varlab),
  lab_vars %>% arrange(varlab)
) %>%
  filter(!var %in% varnames_to_remove) %>%
  pull(var)

tbl1_cat_varnames <- bind_rows(
  demographic_cat_vars,
  comorbidity_cat_vars,
  vital_cat_vars
) %>%
  filter(!var %in% varnames_to_remove) %>%
  pull(var)

tbl1_non_normal_varnames <- c(
  demographic_continuous_vars$var,
  vital_continuous_vars$var,
  comorbidity_continuous_vars$var,
  lab_vars$var
)
  
## Create a TableOne object
tbl1_train <- train_data %>%
  select(one_of(tbl1_varnames, "died")) %>%
  rename_with(get_var_labs, .cols = all_of(tbl1_varnames)) %>%
  rename(Survivor = died) %>%
  CreateTableOne(vars = get_var_labs(tbl1_varnames), 
                 factorVars = get_var_labs(tbl1_cat_varnames), 
                 strata = "Survivor",  addOverall = TRUE, 
                 data = .)

## Print table 1
print(tbl1_train, nonnormal = get_var_labs(tbl1_non_normal_varnames),  
      cramVars = c("Sex"),
      contDigits = 1, missing = TRUE, printToggle = FALSE) %>%
  set_colnames(c("Overall", "Survivor", "Non-survivor",
                "p", "Test", "Missing")) %>%
  kable() %>%
  kable_styling()
Overall Survivor Non-survivor p Test Missing
n 13658 11495 2163
Age (median [IQR]) 62.0 [49.0, 75.0] 59.0 [46.0, 71.0] 77.0 [67.0, 85.0] <0.001 nonnorm 0.0
Calendar time (median [IQR]) 47.0 [38.0, 64.0] 47.0 [38.0, 64.0] 46.0 [37.0, 60.0] <0.001 nonnorm 0.0
Geographic division (%) <0.001 2.7
East North Central 4627 (34.8) 3954 (35.3) 673 (31.9)
Middle Atlantic 4636 (34.9) 3844 (34.4) 792 (37.6)
New England 1583 (11.9) 1272 (11.4) 311 (14.8)
Other 191 ( 1.4) 175 ( 1.6) 16 ( 0.8)
Pacific 511 ( 3.8) 438 ( 3.9) 73 ( 3.5)
South Atl/West South Crl 364 ( 2.7) 317 ( 2.8) 47 ( 2.2)
West North Central 1384 (10.4) 1189 (10.6) 195 ( 9.3)
Race/Ethnicity (%) <0.001 26.1
Non-Hispanic white 5647 (56.0) 4455 (53.1) 1192 (70.3)
Asian 362 ( 3.6) 307 ( 3.7) 55 ( 3.2)
Hispanic 533 ( 5.3) 478 ( 5.7) 55 ( 3.2)
Non-Hispanic black 3547 (35.2) 3153 (37.6) 394 (23.2)
Sex = Female/Male (%) 6563/7091 (48.1/51.9) 5635/5856 (49.0/51.0) 928/1235 (42.9/57.1) <0.001 0.0
Smoking (%) <0.001 25.6
Current 866 ( 8.5) 785 ( 9.0) 81 ( 5.5)
Never 6207 (61.1) 5450 (62.8) 757 (51.1)
Previous 3092 (30.4) 2450 (28.2) 642 (43.4)
Acute myocardial infarction = Yes (%) 1535 (11.2) 1028 ( 8.9) 507 (23.4) <0.001 0.0
AIDS/HIV = Yes (%) 101 ( 0.7) 89 ( 0.8) 12 ( 0.6) 0.339 0.0
Cancer = Yes (%) 1678 (12.3) 1282 (11.2) 396 (18.3) <0.001 0.0
Cerebrovascular disease = Yes (%) 1439 (10.5) 1023 ( 8.9) 416 (19.2) <0.001 0.0
Chronic pulmonary disease = Yes (%) 3627 (26.6) 2933 (25.5) 694 (32.1) <0.001 0.0
Congestive heart failure = Yes (%) 2325 (17.0) 1604 (14.0) 721 (33.3) <0.001 0.0
Dementia = Yes (%) 1394 (10.2) 854 ( 7.4) 540 (25.0) <0.001 0.0
Diabetes = Yes (%) 4612 (33.8) 3669 (31.9) 943 (43.6) <0.001 0.0
Hemiplegia or paraplegia = Yes (%) 330 ( 2.4) 228 ( 2.0) 102 ( 4.7) <0.001 0.0
Hypertension = Yes (%) 8003 (58.6) 6333 (55.1) 1670 (77.2) <0.001 0.0
Metastatic cancer = Yes (%) 277 ( 2.0) 188 ( 1.6) 89 ( 4.1) <0.001 0.0
Mild liver disease = Yes (%) 879 ( 6.4) 711 ( 6.2) 168 ( 7.8) 0.007 0.0
Moderate/severe liver disease = Yes (%) 128 ( 0.9) 88 ( 0.8) 40 ( 1.8) <0.001 0.0
Peptic ulcer disease = Yes (%) 206 ( 1.5) 160 ( 1.4) 46 ( 2.1) 0.013 0.0
Peripheral vascular disease = Yes (%) 1671 (12.2) 1176 (10.2) 495 (22.9) <0.001 0.0
Renal disease = Yes (%) 2833 (20.7) 1984 (17.3) 849 (39.3) <0.001 0.0
Rheumatoid disease = Yes (%) 398 ( 2.9) 315 ( 2.7) 83 ( 3.8) 0.007 0.0
CCI (median [IQR]) 1.0 [0.0, 3.0] 1.0 [0.0, 2.0] 3.0 [1.0, 5.0] <0.001 nonnorm 0.0
Body Mass Index (BMI) (median [IQR]) 29.7 [25.5, 35.1] 30.0 [25.8, 35.4] 28.1 [24.0, 33.5] <0.001 nonnorm 11.9
Diastolic blood pressure (median [IQR]) 73.0 [65.5, 80.5] 74.0 [66.5, 81.0] 68.0 [60.0, 75.5] <0.001 nonnorm 3.1
Heart rate (median [IQR]) 87.5 [77.5, 98.0] 87.0 [77.5, 98.0] 89.0 [77.5, 102.0] <0.001 nonnorm 3.1
Oxygen saturation (median [IQR]) 96.0 [94.0, 98.0] 96.0 [94.5, 98.0] 95.0 [93.0, 97.0] <0.001 nonnorm 3.9
Respiration rate (median [IQR]) 20.0 [18.0, 22.0] 19.5 [18.0, 21.0] 22.0 [19.0, 26.0] <0.001 nonnorm 3.9
Systolic blood pressure (median [IQR]) 126.0 [115.0, 139.0] 127.0 [116.0, 139.0] 122.0 [109.0, 136.5] <0.001 nonnorm 3.2
Temperature (median [IQR]) 37.0 [36.7, 37.4] 37.0 [36.7, 37.4] 37.1 [36.7, 37.6] <0.001 nonnorm 3.1
Alanine aminotransferase (ALT) (median [IQR]) 28.0 [18.0, 46.0] 28.0 [18.0, 46.0] 27.0 [18.0, 44.0] 0.112 nonnorm 20.1
Aspartate aminotransferase (AST) (median [IQR]) 37.0 [25.0, 58.0] 35.0 [25.0, 54.0] 46.0 [30.0, 73.0] <0.001 nonnorm 21.0
C-reactive protein (CRP) (median [IQR]) 79.1 [34.0, 140.0] 72.2 [30.0, 130.0] 116.0 [63.0, 184.0] <0.001 nonnorm 38.7
Creatinine (median [IQR]) 1.0 [0.8, 1.4] 1.0 [0.8, 1.3] 1.3 [1.0, 2.1] <0.001 nonnorm 10.4
Ferritin (median [IQR]) 510.0 [224.0, 1080.0] 470.0 [207.0, 992.0] 747.5 [320.8, 1501.5] <0.001 nonnorm 43.6
Fibrin D-Dimer (median [IQR]) 750.0 [390.0, 1540.8] 692.5 [370.0, 1346.5] 1345.0 [668.2, 3315.0] <0.001 nonnorm 90.4
Lactate dehydrogenase (LDH) (median [IQR]) 321.0 [238.0, 441.0] 308.0 [232.0, 415.0] 404.0 [284.0, 556.5] <0.001 nonnorm 45.2
Lymphocyte count (median [IQR]) 1.0 [0.7, 1.4] 1.0 [0.7, 1.4] 0.8 [0.5, 1.1] <0.001 nonnorm 11.2
Neutrophil count (median [IQR]) 4.9 [3.4, 7.1] 4.7 [3.2, 6.7] 6.1 [4.1, 9.2] <0.001 nonnorm 11.2
Platelet count (PLT) (median [IQR]) 202.0 [157.0, 260.0] 205.0 [160.0, 262.0] 187.5 [143.0, 245.0] <0.001 nonnorm 9.8
Procalcitonin (median [IQR]) 0.1 [0.1, 0.4] 0.1 [0.1, 0.3] 0.3 [0.1, 1.0] <0.001 nonnorm 49.3
Troponin I (median [IQR]) 0.0 [0.0, 0.0] 0.0 [0.0, 0.0] 0.0 [0.0, 0.1] <0.001 nonnorm 41.2
White blood cell count (WBC) (median [IQR]) 6.7 [4.9, 9.1] 6.5 [4.8, 8.7] 7.7 [5.6, 11.1] <0.001 nonnorm 9.7

3.6 Transformations of continuous variables

The functional form of the relationship between mortality and the continuous variables is assessed using a series of univariate fits. We mainly rely on visual inspection of the graphs but also report the Bayesian information criterion (BIC).

fit_univariate_logit <- function(var, data){
  make_f <- function(rhs){
    as.formula(paste("died", rhs, sep =" ~ "))
  }
  fit_logit <- function(f, data){
    glm(f, family = "binomial", data = data)
  }
  
  list(
    `Linear` = fit_logit(make_f(var), data),
    `Spline 3 knots` = fit_logit(make_f(sprintf("ns(%s, 5)", var)), data),
    `Spline 4 knots` = fit_logit(make_f(sprintf("ns(%s, 6)", var)), data),
    `Spline 5 knots` = fit_logit(make_f(sprintf("ns(%s, 7)", var)), data)
  )
}

predict_univariate_logit <- function(models, var, var_values, type = "response"){
  newdata <- data.frame(var = var_values)
  colnames(newdata) <- var
  map_dfc(models,
         function(x) predict(x, newdata = newdata, type = type)) %>%
  mutate(var = var_values) %>%
  pivot_longer(cols = names(models),
               names_to = "Model",
               values_to = "y")
}

midpoint <- function(x, digits = 2){
  lower <- as.numeric(gsub(",.*", "", gsub("\\(|\\[|\\)|\\]", "", x)))
  upper <- as.numeric(gsub(".*,", "", gsub("\\(|\\[|\\)|\\]", "", x)))
  return(round(lower+(upper-lower)/2, digits))
}

bin_y <- function(var, var_values){
  data <- train_data[, c(var, "died")] %>%
    filter(!is.na(get(var)))
  data <- data %>%
    mutate(x_cat = cut(get(var), breaks = 20),
           x_midpoint = midpoint(x_cat)) %>%
    group_by(x_midpoint) %>%
    summarise(y = mean(died),
              n = n())
  colnames(data)[1] <- var
  return(data)
}

plot_univariate_logit <- function(models, var, var_values, var_lab = "Variable",
                                  type = "response", ylab = "Probability of death"){
  # Plotting data
  predicted_probs <- predict_univariate_logit(models, var, var_values, type = type)
  ylab <- switch(type,
                 "link" = "Log odds",
                 "response" = "Probability of death",
                 stop("Type must be 'link' or 'response'")
  )
  binned_y <- bin_y(var, var_values)
  if (type == "link"){
    binned_y$y <- ifelse(binned_y$y == 0, .001, binned_y$y)
    binned_y$y <- ifelse(binned_y$y == 1, .99, binned_y$y)
    binned_y$y <- qlogis(binned_y$y)
  }
  
  # Plotting scales
  y_min <- min(c(binned_y$y, predicted_probs$y))
  y_max <- max(c(binned_y$y, predicted_probs$y))
  size_breaks <- seq(min(binned_y$n), max(binned_y$n),
                     length.out = 6)
  
  # Plot
  ggplot(predicted_probs,
       aes(x = var, y = y)) +
    geom_line() +
    geom_point(data = binned_y, aes_string(x = var, y = "y", size = "n")) +
    facet_wrap(~Model, ncol = 2) +
    xlab(var_lab) +
    ylab(ylab) +
    ylim(floor(y_min), ceiling(y_max)) +
    scale_size(name = "Sample size", range = c(0.3, 3), 
               breaks = round(size_breaks, 0)) 
}

make_seq <- function(var){
  var_min <- min(train_data[[var]], na.rm = TRUE)
  var_max <- max(train_data[[var]], na.rm = TRUE)
  seq(var_min, var_max, length.out = 100)
}

evaluate_univariate_logit <- function(var, print = TRUE){
  var_values = make_seq(var)
  var_lab = get_var_labs(var)
  
  # Do evaluations
  fits <- fit_univariate_logit(var, data = train_data)
  p_link <- plot_univariate_logit(fits, var, var_values, var_lab, type = "link")
  p_probs <- plot_univariate_logit(fits, var, var_values, var_lab, type = "response")
  bic <- sapply(fits, BIC)
  
  # Print and return
  if (print){
    print(p_link)
    print(p_probs)
    print(bic)
  }
  return(list(fits = fits, p_link = p_link, p_probs = p_probs,
              bic = bic))
}

3.6.1 Demographics

3.6.1.1 Age

uv_age <- evaluate_univariate_logit("age")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10153.39       10179.96       10189.56       10198.91

3.6.1.2 Calendar time

uv_calendar_time <- evaluate_univariate_logit("calendar_time")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11927.48       11947.86       11956.50       11965.07

3.6.2 Vitals

3.6.2.1 Body Mass Index

uv_bmi <- evaluate_univariate_logit("bmi")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10634.18       10628.07       10636.99       10642.34

3.6.2.2 Diastolic blood pressure

uv_dbp <- evaluate_univariate_logit("dbp")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11072.71       11064.10       11073.56       11082.72

3.6.2.3 Systolic blood pressure

uv_sbp <- evaluate_univariate_logit("sbp")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11463.15       11244.37       11253.16       11262.85

3.6.2.4 Heart rate

uv_hr <- evaluate_univariate_logit("hr")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11501.00       11406.44       11414.25       11422.72

3.6.2.5 Respiration rate

uv_resp <- evaluate_univariate_logit("resp")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10747.57       10511.21       10520.03       10528.45

3.6.2.6 Oxygen saturation (%)

uv_spo2 <- evaluate_univariate_logit("spo2")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11141.19       11109.15       11118.68       11128.31

3.6.2.7 Temperature

uv_temp <- evaluate_univariate_logit("temp")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11594.91       11352.92       11360.11       11367.23

3.6.3 Labs

3.6.3.1 Alanine aminotransferase (U/L)

uv_alt <- evaluate_univariate_logit("alt")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10129.18       10159.31       10166.55       10172.47
uv_alt_t <- evaluate_univariate_logit("alt_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10127.15       10161.10       10170.62       10178.92

3.6.3.2 Alanine aminotransferase (U/L)

uv_ast <- evaluate_univariate_logit("ast")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       9998.672       9837.378       9844.663       9852.407
uv_ast_t <- evaluate_univariate_logit("ast_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       9850.176       9840.492       9849.651       9858.934

3.6.3.3 C-reactive protein (mg/L)

uv_crp <- evaluate_univariate_logit("crp")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7611.060       7598.858       7607.186       7613.877
uv_crp_t <- evaluate_univariate_logit("crp_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7609.387       7598.807       7607.153       7613.778

3.6.3.4 Creatinine (mg/dL)

uv_creatinine <- evaluate_univariate_logit("creatinine")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10993.75       10514.41       10503.31       10504.14
uv_creatinine_t <- evaluate_univariate_logit("creatinine_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10601.63       10480.42       10488.51       10495.10

3.6.3.5 Ferritin (ng/mL)

uv_ferritin <- evaluate_univariate_logit("ferritin")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7309.122       7198.348       7208.925       7215.448
uv_ferritin_t <- evaluate_univariate_logit("ferritin_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7214.138       7200.565       7211.500       7216.768

3.6.3.6 Fibrin D-Dimer (ng/mL)

uv_d_dimer <- evaluate_univariate_logit("d_dimer")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       1201.612       1189.221       1193.070       1197.217
uv_d_dimer_t <- evaluate_univariate_logit("d_dimer_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       1154.281       1177.857       1184.975       1192.032

3.6.3.7 Lactate dehydrogenase (U/L)

uv_ldh <- evaluate_univariate_logit("ldh")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       6916.380       6727.567       6730.615       6736.431
uv_ldh_t <- evaluate_univariate_logit("ldh_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       6722.740       6711.902       6720.931       6729.386

3.6.3.8 Lymphocyte count (10^3/uL)

uv_lymphocyte <- evaluate_univariate_logit("lymphocyte")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11021.34       10676.61       10684.69       10693.36
uv_lymphocyte_t <- evaluate_univariate_logit("lymphocyte_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10839.18       10669.88       10677.76       10686.35

3.6.3.9 Neutrophil count (10^3/uL)

uv_neutrophil <- evaluate_univariate_logit("neutrophil")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10741.03       10722.67       10723.14       10730.54
uv_neutrophil_t <- evaluate_univariate_logit("neutrophil_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10709.62       10710.56       10716.35       10724.36

3.6.3.10 Procalcitonin (ng/mL)

uv_pct <- evaluate_univariate_logit("pct")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       6956.604       6495.873       6495.437       6501.271
uv_pct_t <- evaluate_univariate_logit("pct_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       6565.428       6491.765       6493.818       6502.947

3.6.3.11 Troponin I (ng/mL)

uv_tni <- evaluate_univariate_logit("tni")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7750.579       7043.096       7048.626       7057.567
uv_tni_t <- evaluate_univariate_logit("tni_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       7074.305       6951.225       6959.737       6968.103

3.6.3.12 Platelet count (10^3/uL)

uv_plt <- evaluate_univariate_logit("plt")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11102.84       11079.15       11086.55       11095.10
uv_plt_t <- evaluate_univariate_logit("plt_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       11099.10       11078.61       11086.35       11094.90

3.6.3.13 White blood cell count (10^3/uL)

uv_wbc <- evaluate_univariate_logit("wbc")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10987.01       10940.40       10944.79       10951.35
uv_wbc_t <- evaluate_univariate_logit("wbc_t")

##         Linear Spline 3 knots Spline 4 knots Spline 5 knots 
##       10934.25       10925.33       10933.11       10940.05

4 Data preprocessing

4.1 Candidate variables for modeling

Variables that will be candidates for inclusion in our model and used during variable selection are specified here. We will combine race and ethnicity into a single variable, but will wait to do this until after multiple imputation (see next section). Alanine aminotransferase (ALT) will also be exclued due to (i) a strong correlation with AST (0.83) and weak univariate associations with mortality (see above).

demographics_to_include <- c("age", "sex", "race", "ethnicity", "division",
                             "smoke", "calendar_time")
comorbidities_to_include <- comorbidity_cat_vars %>%
  filter(!var %in% c("diabunc", "diabwc", "hypunc", "hypc")) %>%
  pull(var) %>%
  c("diab", "hyp")
vitals_to_include <- c("bmi", "temp", "hr", "resp", "spo2", "sbp")
labs_to_include <- c("crp_t", "tni_t", "ast_t", "ferritin_t",
                     "creatinine_t", "ldh_t", "lymphocyte_t", "neutrophil_t",
                     "plt_t", "wbc_t")
vars <- vars %>%
  mutate(include = ifelse(var %in% c(demographics_to_include, 
                                     comorbidities_to_include,
                                     vitals_to_include, 
                                     labs_to_include),
                          1, 0))
get_included_vars <- function(){
  vars[vars$include == 1, ]$var
}

make_rhs <- function(vars){
  as.formula(paste0("~", paste(vars, collapse = " + ")))
}
candidate_model_rhs <- make_rhs(get_included_vars()) 

4.2 Missing data imputation

Let’s re-level variables so that we have preferred reference categories.

train_data <- train_data %>% mutate(
  sex = relevel(factor(sex), ref = "Male"),
  division = relevel(factor(division), ref = "Pacific"),
  smoke = relevel(factor(smoke), ref = "Never"),
  bmi_cat = relevel(factor(bmi_cat), ref = "Normal")
)

We can now multiply impute data use MICE.

# Run MICE algorithm
mice_out <- train_data %>%
  select(c(one_of(get_included_vars()))) %>%
  mutate_if(is.character, as.factor) %>%
  mice(m = n_imputations, maxit = 5) 
## 
##  iter imp variable
##   1   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
## Save variables for test set imputation
mice_vars <- c(get_included_vars())

# Append datasets and add death
mi_df <- complete(mice_out, action = "long", include = TRUE) %>%
  as_tibble() %>%
  mutate(died = rep(train_data$died, mice_out$m + 1))

# To compare MICE to aregImpute
# areg_out <-  aregImpute(update.formula(candidate_model_rhs, ~.), 
#                                        n.impute = 2, data = train_data)

4.3 Imputation diagnostics

The distributions of the imputed and observed data are compared as a diagnostic for the imputation. They look pretty similar suggesting that there is nothing terribly wrong with the imputation.

make_imp_df <- function(object){
  # Get imputations
  if (inherits(object, "mids")){
    imp <- object$imp
  } else{ # aregImpute
   imp <- object$imputed 
   for (i in 1:length(imp)){
     cat_levels_i <- object$cat.levels[[i]]
     if (!is.null(cat_levels_i) && !is.null(imp[[i]])){
       levels <- sort(unique(c(imp[[i]])))
       imp[[i]] <- apply(imp[[i]], 
                         2, 
                         function(x) factor(x, levels = levels,
                                            labels = cat_levels_i))
     }
   }
  }
  
  # Create list of data frames
  is_numeric <- sapply(imp, function (x) is.numeric(x[, 1]))
  continuous_df <- vector(mode = "list", length = sum(is_numeric))
  cat_df <- vector(mode = "list", length = sum(!is_numeric))
  continuous_cntr <- 1
  cat_cntr <- 1
  for (i in 1:length(imp)){
    if(!is.null(nrow(imp[[i]])) && nrow(imp[[i]]) > 0 ){
        imp_i_df <- data.frame(var = names(imp)[i],
                               imp = rep(1:ncol(imp[[i]]), each = nrow(imp[[i]])),
                               value = c(as.matrix(imp[[i]]))) %>% 
          as_tibble()
        
    } else{
      imp_i_df <- NULL
    }
    if (is_numeric[i]){
      continuous_df[[continuous_cntr]] <- imp_i_df
      continuous_cntr <- continuous_cntr + 1
    } else{
      cat_df[[cat_cntr]] <- imp_i_df
      cat_cntr <- cat_cntr + 1
    }
  } 
  
  # Row bind data frames
  continuous_df = bind_rows(continuous_df) %>%
    mutate(obs = "Imputed",
           varlab = get_var_labs(var)) 
  
  cat_df = bind_rows(cat_df) %>%
    mutate(obs = "Imputed",
           varlab = get_var_labs(var)) 
              
  # Return
  return(list(continuous = continuous_df,
              cat = cat_df))
}
imp_df <- make_imp_df(mice_out)
#imp_df <- make_imp_df(areg_out)
# Plot continuous variables
## Data for plotting
obsimp_df_continuous <- bind_rows(
  imp_df$continuous,
  continuous_var_df %>%
    select(var, value, varlab) %>%
    mutate(imp = 0, obs = "Observed")
) %>%
  mutate(imp = ifelse(imp == 0, "Observed", paste0("Imputation ", imp))) %>%
  filter(var %in% unique(imp_df$continuous$var))
  
## Plot
ggplot(obsimp_df_continuous,
       aes(x = value, col = imp)) +
  geom_density(position = "jitter") +
  facet_wrap(~varlab, scales = "free", ncol = 3) +
  xlab("") + ylab("Density") +
  scale_color_discrete(name = "") +
  theme(legend.position = "bottom")

# Plot categorical variables
## Data for plotting
obsimp_df_cat <- 
  bind_rows(
    imp_df$cat %>%
      group_by(var, varlab, value, imp) %>%
      summarise(n = n()) %>%
      group_by(var, varlab, imp) %>%
      mutate(freq = n / sum(n)),
    cat_var_df %>%
      select(var, value, varlab, n, freq) %>%
      mutate(imp = 0, obs = "Observed")
  ) %>%
    mutate(imp = ifelse(imp == 0, "Observed", paste0("Imputation ", imp))) %>%
    filter(var %in% unique(imp_df$cat$var))
## `summarise()` regrouping output by 'var', 'varlab', 'value' (override with `.groups` argument)
# Plot
ggplot(obsimp_df_cat, 
       aes(x = value, y = freq, fill = imp)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~varlab, scales = "free_x") +
  scale_fill_discrete(name = "") +
  xlab("") +
  ylab("Proportion") +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

4.4 Recoding categorical variables

We create the race/ethnicity variable after imputation of missing values of race and ethnicity. We will also combine the diabetes and hypertension variables.

mi_df <- mi_df %>% 
  add_race_ethnicity() 

prop.table(table(mi_df %>% filter(.imp == 0) %>% pull(diab)))
## 
##        No       Yes 
## 0.6623224 0.3376776
prop.table(table(mi_df %>% filter(.imp == 0) %>% pull(hyp)))
## 
##        No       Yes 
## 0.4140431 0.5859569

Now lets use the combined race and ethnicity category in our model.

vars <- vars %>%
  mutate(include = case_when(
      var == "race_ethnicity" ~ 1,
      var == "race" ~ 0,
      var == "ethnicity" ~ 0,
      TRUE ~ include
    )
  )
  
candidate_model_rhs <- make_rhs(get_included_vars())

Finally, we will (i) add the new race/ethnicity variable to the “MICE” object and (ii) create a list of imputed datasets for analysis.

mice_out <- as.mids(mi_df)
mi_list <- mi_df %>% 
  filter(.imp > 0) %>% 
  split(list(.$.imp))

5 Variable selection

The continuous variables in our model are transformed using restricted cubic splines with 3 knots.

candidate_model_rhs <- candidate_model_rhs %>%
  update.formula( ~. + rcs(age, 3) - age +
                       rcs(calendar_time, 3) - calendar_time +
                       rcs(bmi, 3) - bmi +
                       rcs(temp, 3) - temp +
                       rcs(hr, 3) - hr +
                       rcs(resp, 3) - resp +
                       rcs(sbp, 3) - sbp +
                       rcs(spo2, 3) - spo2 +
                       rcs(crp_t, 3) - crp_t +
                       rcs(tni_t, 3) - tni_t +
                       rcs(ast_t, 3) - ast_t +
                       rcs(creatinine_t, 3) - creatinine_t +
                       rcs(ferritin_t, 3) - ferritin_t +
                       rcs(ldh_t, 3) - ldh_t,
                       rcs(lymphocyte_t, 3) - lymphocyte_t +
                       rcs(neutrophil_t, 3) - neutrophil_t +
                       rcs(plt_t, 3) - plt_t +
                       rcs(wbc_t, 3) - wbc_t)

With oem we need to create an x matrix since there is no formula interface.

rename_rcs <- function(v){
  rcs_ind <- grep("rcs", v)
  v[rcs_ind] <- sub("rcs.*)", "", v[rcs_ind])
  return(v)
}

rename_terms <- function(v){
  v <- gsub(" ", "_", v)
  v <- gsub("-", "", v)
  v <- gsub("/", "", v)
  v <- rename_rcs(v)
  return(v)
}

make_x <- function(data, rhs){
  x <- model.matrix(rhs, data)
  assign <- attr(x, "assign")
  colnames(x) <- rename_terms(colnames(x))
  x <- x[, -1]
  attr(x, "assign") <- assign[-1]
  return(x)
}

# List of x and y for each imputed dataset
x <- mi_list %>% map(function(data) make_x(data, candidate_model_rhs))
y <- mi_list %>% map(function(x) x[["died"]])

To make the graphs look nice, we will import labels for the model terms.

terms <- read.csv("risk-factors-terms.csv") %>%
  left_join(vars[, c("var", "group")], by = "var")

get_term_labs <- function(v, term_name = "term"){
  terms$termlab[match(v, terms[[term_name]])]
}

match_terms_to_vars <- function(t){
  terms$var[match(t, terms$term)]
}

Finally, we run the grouped lasso.

# Number of folds for cross-validation
n_folds <- 10

# Threshold for variable inclusion
inclusion_threshold <- 0.9

# Matrix to store inclusion results
inclusion_sim <- matrix(0,  ncol = ncol(x[[1]]) + 1, 
                        nrow = n_rep * n_imputations)

# Groups
groups <- attr(x[[1]], "assign") 

# Convenience function to extract coefficients from Group-Lasso
coef_cv_oem <- function(object){
  coef <- object$oem.fit$beta$grp.lasso
  lse_ind <- which(object$lambda[[1]] == object$lambda.1se.models)
  return(coef[, lse_ind])
}

# Variable selection via Group-Lasso
cntr <- 1
for (i in 1:n_imputations){
  for (j in 1:n_rep){
    
    # Cross-validation
    oem_cvfit <- cv.oem(x = x[[i]], y = y[[i]], 
                        penalty = "grp.lasso", 
                        groups = groups, 
                        family = "binomial",
                        type.measure = "deviance",
                        nfolds = n_folds
                        )
  
    # Count nonzero coefficients 
    inclusion_sim[cntr, which(coef_cv_oem(oem_cvfit) != 0)] <- 1
    
    # Iterate
    cntr <- cntr + 1
  } # End repeated CV loop
} # End imputation loop 

Let’s check the inclusion probabilities from the above repeated cross-validation steps.

# Percentage of simulations each term is included
inclusion_sim <- inclusion_sim[, -1] # Remove intercept
colnames(inclusion_sim) = colnames(x[[1]])

inclusion_summary <- tibble(term = colnames(inclusion_sim), 
                            prob = apply(inclusion_sim, 2, mean)) 
model_terms <- inclusion_summary %>%
  filter(prob >= inclusion_threshold) %>%
  pull(term)

# Percentage of simulations each variable is included
inclusion_summary <- inclusion_summary %>%
  mutate(var = match_terms_to_vars(term)) %>%
  mutate(varlab = get_var_labs(var)) %>% 
  distinct(prob, var, varlab)

# Plot
ggplot(inclusion_summary,
       aes(x = reorder(varlab, prob), y = prob)) +
  geom_bar(stat = "identity") + 
  geom_hline(yintercept = inclusion_threshold, linetype = "dotted", 
                color = "red", size = 1) +
  ylab("Probability of inclusion") +
  coord_flip() +
  theme(axis.title.y = element_blank())

6 Model interpretation

6.1 Model terms and data

Variables are included in the model based on the group Lasso simulation implemented above.

vars_to_exclude <- inclusion_summary %>%
  filter(prob < inclusion_threshold) %>%
  pull(var) %>%
  setdiff("sex") # Keep sex even if not picked by group lasso

remove_terms_from_rhs <- function(f, vars_to_exclude){
  # First convert formula to string separated by +
  f_string <- Reduce(paste, deparse(f))
  f_string <- gsub("~", "", f_string)
  f_string <- gsub(" ", "", f_string)
  
  # Then convert string to vector
  f_vec <-   unlist(strsplit(f_string, "\\+"))
  pattern_to_exclude <- paste(vars_to_exclude, collapse = "|")
  f_vec <- f_vec[!grepl(pattern_to_exclude, f_vec)]
  
  # Convert string back to formula
  f_new <- paste0("~", paste(f_vec, collapse = " + "))
  return(as.formula(f_new))
}

model_rhs <- remove_terms_from_rhs(candidate_model_rhs, vars_to_exclude)
label(mi_df) <- map(colnames(mi_df), 
                    function(x) label(mi_df[, x]) <- get_var_labs(x))
dd <- datadist(mi_df, adjto.cat = "first")
options(datadist = "dd")

6.2 Fit

As described in the paper, five models are fit that include the following predictors: (i) age only, (ii) comorbidities only, (iii) all demographics (and calendar time), and (iv) demographics (and calendar time) and comorbidities, and (v) all variables selected by the grouped lasso.

# The four models
lrm_names <- c("Age only",
               "Comorbidities only",
               "All demographics",
               "Demographics and comorbidities",
               "All variables")

## (1): fit_age: Only includes age
f_lrm_age <- died ~ age 
lrm_fit_age <- fit.mult.impute(f_lrm_age, fitter = lrm, xtrans = mice_out, pr = FALSE,
                               x = TRUE, y = TRUE)

## (2): fit_c: Only includes comorbidities
c_vars <- vars %>%
  filter(group == "Comorbidities" & include == 1 & !var %in% vars_to_exclude) %>%
  pull(var)
f_lrm_c <- as.formula(paste0("died ~", paste(c_vars, collapse = "+")))
lrm_fit_c <- fit.mult.impute(f_lrm_c, fitter = lrm, xtrans = mice_out, 
                             pr = FALSE, x = TRUE, y = TRUE) 

## (3): fit_d: All demographics including age
f_lrm_d <- update.formula(f_lrm_age, ~. + sex + rcs(calendar_time, 3) + 
                          race_ethnicity + division)
lrm_fit_d <- fit.mult.impute(f_lrm_d, fitter = lrm, xtrans = mice_out, pr = FALSE,
                             x = TRUE, y = TRUE)

## (4): fit_dc: Demographics and comorbidities
f_lrm_dc <- update.formula(f_lrm_d, 
                           as.formula(paste0("~.+", paste(c_vars, collapse = "+"))))
lrm_fit_dc <- fit.mult.impute(f_lrm_dc, fitter = lrm, xtrans = mice_out, pr = FALSE,
                              x = TRUE, y = TRUE)

## (5): fit_all: The main model including demographics, comorbidities, vitals, and labs
f_lrm_all <- update.formula(model_rhs, died ~ .)
lrm_fit_all <- fit.mult.impute(f_lrm_all, fitter = lrm, xtrans = mice_out, 
                               pr = FALSE, x = TRUE, y = TRUE)

### Note that we can also estimate the models with stats::glm
glm_fits_all <- mi_list %>%
  map(function (x) glm(f_lrm_all, data = x, family = "binomial")) 
glm_fit_all <- glm_fits_all %>% pool()

We will print the results of the full model for the interested reader

lrm_fit_all
## Logistic Regression Model
##  
##  fit.mult.impute(formula = f_lrm_all, fitter = lrm, xtrans = mice_out, 
##      pr = FALSE, x = TRUE, y = TRUE)
##  
##                          Model Likelihood    Discrimination    Rank Discrim.    
##                                Ratio Test           Indexes          Indexes    
##  Obs          13658    LR chi2    3952.05    R2       0.431    C       0.880    
##   0           11495    d.f.            59    g        2.289    Dxy     0.761    
##   1            2163    Pr(> chi2) <0.0001    gr       9.866    gamma   0.761    
##  max |deriv| 0.0001                          gp       0.200    tau-a   0.203    
##                                              Brier    0.091                     
##  
##                                    Coef     S.E.    Wald Z Pr(>|Z|)
##  Intercept                          15.5496  2.6631  5.84  <0.0001 
##  sex=Female                         -0.0426  0.0659 -0.65  0.5175  
##  division=East North Central        -0.3179  0.1682 -1.89  0.0587  
##  division=Middle Atlantic           -0.1768  0.1652 -1.07  0.2845  
##  division=New England               -0.0930  0.1802 -0.52  0.6058  
##  division=Other                     -0.6977  0.3752 -1.86  0.0629  
##  division=South Atl/West South Crl  -0.4371  0.2367 -1.85  0.0649  
##  division=West North Central        -0.3105  0.1822 -1.70  0.0883  
##  smoke=Current                      -0.0249  0.1689 -0.15  0.8827  
##  smoke=Previous                      0.0899  0.0728  1.24  0.2168  
##  race_ethnicity=Asian               -0.0926  0.1930 -0.48  0.6315  
##  race_ethnicity=Hispanic            -0.1482  0.1093 -1.36  0.1752  
##  race_ethnicity=Non-Hispanic black  -0.4171  0.0806 -5.17  <0.0001 
##  ami=Yes                             0.1239  0.0824  1.50  0.1326  
##  chf=Yes                             0.2168  0.0763  2.84  0.0045  
##  pvd=Yes                             0.0740  0.0802  0.92  0.3567  
##  cevd=Yes                            0.0553  0.0863  0.64  0.5215  
##  dementia=Yes                        0.3973  0.0828  4.80  <0.0001 
##  cpd=Yes                             0.0110  0.0704  0.16  0.8761  
##  rheumd=Yes                          0.0566  0.1549  0.37  0.7151  
##  mld=Yes                             0.0873  0.1198  0.73  0.4660  
##  hp=Yes                              0.6987  0.1556  4.49  <0.0001 
##  rend=Yes                            0.0981  0.0816  1.20  0.2295  
##  canc=Yes                            0.0249  0.0842  0.30  0.7671  
##  msld=Yes                            0.7703  0.2521  3.05  0.0023  
##  metacanc=Yes                        0.8451  0.1731  4.88  <0.0001 
##  aids=Yes                            0.5107  0.3613  1.41  0.1575  
##  diab=Yes                            0.1006  0.0647  1.56  0.1198  
##  hyp=Yes                             0.1046  0.0779  1.34  0.1790  
##  lymphocyte_t                       -0.2399  0.0554 -4.33  <0.0001 
##  plt_t                              -0.0022  0.0004 -5.61  <0.0001 
##  wbc_t                               0.0534  0.0102  5.25  <0.0001 
##  age                                 0.0698  0.0043 16.07  <0.0001 
##  age'                               -0.0060  0.0054 -1.13  0.2592  
##  calendar_time                      -0.0151  0.0045 -3.32  0.0009  
##  calendar_time'                      0.0046  0.0077  0.61  0.5441  
##  bmi                                -0.0213  0.0097 -2.19  0.0287  
##  bmi'                                0.0431  0.0117  3.68  0.0002  
##  temp                               -0.2080  0.0525 -3.96  <0.0001 
##  temp'                               0.9269  0.0827 11.21  <0.0001 
##  hr                                  0.0020  0.0045  0.44  0.6609  
##  hr'                                 0.0110  0.0051  2.15  0.0319  
##  resp                                0.0376  0.0220  1.71  0.0871  
##  resp'                               0.0484  0.0245  1.98  0.0481  
##  sbp                                -0.0332  0.0036 -9.14  <0.0001 
##  sbp'                                0.0313  0.0044  7.06  <0.0001 
##  spo2                               -0.1308  0.0181 -7.23  <0.0001 
##  spo2'                               0.1255  0.0285  4.40  <0.0001 
##  crp_t                               0.0034  0.0018  1.92  0.0543  
##  crp_t'                             -0.0035  0.0022 -1.59  0.1127  
##  tni_t                               7.7244  3.8317  2.02  0.0438  
##  tni_t'                            -18.8123 15.0884 -1.25  0.2125  
##  ast_t                               0.0137  0.0043  3.18  0.0015  
##  ast_t'                             -0.0236  0.0078 -3.04  0.0024  
##  creatinine_t                        0.3529  0.1129  3.13  0.0018  
##  creatinine_t'                      -0.2331  0.1909 -1.22  0.2220  
##  ferritin_t                          0.0002  0.0002  0.87  0.3866  
##  ferritin_t'                        -0.0002  0.0004 -0.63  0.5314  
##  ldh_t                               0.0015  0.0007  2.06  0.0396  
##  ldh_t'                             -0.0005  0.0009 -0.55  0.5792  
## 

6.3 Collinearity

Let’s examine the extent to which our predictors are collinear: for categorical and continous variables, we use an anova model; for categorical and categorical variables, we use Cramer’s V; and for continous and continous variables, we use spearman correlation. (Note that this takes a long time to run and could probablty be made more efficient.)

## from https://stackoverflow.com/questions/52554336/plot-the-equivalent-of-correlation-matrix-for-factors-categorical-data-and-mi
mixed_assoc <- function(df, cor_method = "spearman", adjust_cramersv_bias = TRUE){
  df_comb <- expand.grid(names(df), names(df),  stringsAsFactors = F) %>% 
    set_names("X1", "X2")
  is_nominal <- function(x) inherits(x, c("factor", "character"))
  # https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
  # https://github.com/r-lib/rlang/issues/781
  is_numeric <- function(x) { is.integer(x) || is_double(x)}
  f <- function(x_name, y_name) {
    x <-  pull(df, x_name)
    y <-  pull(df, y_name)
    result <- if(is_nominal(x) && is_nominal(y)){
        # use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
        cv <- cramerV(as.character(x), as.character(y), 
                      bias.correct = adjust_cramersv_bias)
        data.frame(x_name, y_name, assoc = cv, type = "cramersV")
    } else if(is_numeric(x) && is_numeric(y)){
        correlation <- cor(x, y, method = cor_method, use = "complete.obs")
        data.frame(x_name, y_name, assoc = correlation, type = "correlation")
    } else if(is_numeric(x) && is_nominal(y)){
        # from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
        r_squared <- summary(lm(x ~ y))$r.squared
        data.frame(x_name, y_name, assoc = sqrt(r_squared), type = "anova")
    } else if(is_nominal(x) && is_numeric(y)){
        r_squared <- summary(lm(y ~ x))$r.squared
        data.frame(x_name, y_name, assoc = sqrt(r_squared), type = "anova")
    } else {
        warning(paste("unmatched column type combination: ", class(x), class(y)))
    }
    # finally add complete obs number and ratio to table
    result %>% 
      mutate(
        complete_obs_pairs = sum(!is.na(x) & !is.na(y)),
        complete_obs_ratio = complete_obs_pairs/length(x)) %>%
      rename(x = x_name, y = y_name)
  }
  # apply function to each variable combination
  map2_df(df_comb$X1, df_comb$X2, f)
}
# Create correlation matrix of associations
corr_mat <- mi_df %>% 
  filter(.imp == 1) %>% 
  select(any_of(get_included_vars())) %>%
  mixed_assoc() %>%
  select(x, y, assoc) %>%
  pivot_wider(names_from = y, values_from = assoc) %>%
  column_to_rownames("x") %>%
  as.matrix

# Make tile plot
m <- abs(corr_mat)
heatmap_df <- tibble(row = rownames(m)[row(m)], 
                     col = colnames(m)[col(m)], corr = c(m)) %>%
  mutate(row = get_var_labs(row),
         col = get_var_labs(col))
heatmap_df %>%
  ggplot(aes(x = row, y = col, fill = corr)) + 
  geom_tile() + 
  scale_fill_continuous("Correlation") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.title = element_blank()) 

6.4 Variable importance

Variable importance is assessed with a Wald test using the rms::anova().

# Compute variable important with Wald chi-square
lrm_anova_all <- anova(lrm_fit_all)

# Plot the result
lrm_anova_all %>% 
  as_tibble() %>%
  mutate(var = gsub(" ", "", rownames(lrm_anova_all)),
         varlab = get_var_labs(var),
         value = as.double(`Chi-Square` - `d.f.`)) %>%
  filter(!var %in% c("TOTAL", "Nonlinear", "TOTALNONLINEAR")) %>%

  ggplot(aes(x = value, y = reorder(varlab, value))) +
  geom_point() +
  theme(axis.title.y = element_blank()) +
  xlab(expression(chi^2-df)) 

6.5 Summary of odds ratios

The model is summarized with odds ratios using rms::summary(). We will start by assessing the full model. Next, since labs and vitals may themselves be “caused” by comorbidities, we will see how the odds ratios for the comorbidities change after dropping labs and vitals from the model.

6.5.1 Full model

lrm_summary_all <- summary(lrm_fit_all)

# Odds ratios
format_or_range <- function(x, term){
  case_when(
    x < 10 ~ formatC(x, format = "f", digits = 2),
    term == "temp" ~ formatC(x, format = "f", digits = 1),
    TRUE ~ formatC(x, format = "f", digits = 0)
  )
}

make_tidy_or <- function(object, model_name = NULL){
  if (is.null(model_name)) model_name <- "Model"
  object %>%
    as.data.frame() %>%
    as_tibble() %>%
    mutate(term = rownames(object),
           High = format_or_range(High, term),
           Low = format_or_range(Low, term),
           termlab = get_term_labs(term, "term2"),
           termlab = ifelse(!is.na(`Diff.`), 
                            paste0(termlab, " - ", High, ":", Low),
                            termlab),
           or = exp(Effect),
           or_lower = as.double(exp(`Lower 0.95`)),
           or_upper = exp(`Upper 0.95`)) %>%
    filter(Type == 1) %>%
    select(term, termlab, or, or_lower, or_upper) %>%
    arrange(-or) %>%
    mutate(model = model_name)
}
lrm_or_all <- make_tidy_or(lrm_summary_all, "All variables") 

# Odds ratio plot
ggplot(lrm_or_all, 
       aes(x = or, y = reorder(termlab, or))) +
  geom_point() +
  geom_errorbarh(aes(xmax = or_upper, xmin = or_lower,
                     height = .2)) +
  geom_vline(xintercept = 1, linetype = "dashed", col = "grey") + 
  theme(axis.title.y = element_blank()) +
  xlab("Odds ratio")

6.5.2 Comparison of model with and without vitals + labs

lrm_summary_dc <- summary(lrm_fit_dc)
lrm_or_dc <- make_tidy_or(lrm_summary_dc, "Demographics + comorbidities")

# Odds ratio comparison plot
lrm_or_comp <- bind_rows(lrm_or_all, lrm_or_dc) %>%
  filter(term %in% 
           terms[terms$group %in% c("Demographics", "Comorbidities"), ]$term2) %>%
  mutate(termlab = factor(termlab),
         termlab = reorder(termlab, or, function (x) -mean(x)))

ggplot(lrm_or_comp, 
       aes(x = termlab, y = or, col = model)) +
  geom_point(position = position_dodge(width = 1)) +
  geom_errorbar(aes(ymax = or_upper, ymin = or_lower,
                     width = .2), position = position_dodge(width = 1)) +
  facet_wrap(~termlab, strip.position = "left", ncol = 1, scales = "free_y") + 
  geom_hline(yintercept = 1, linetype = "dashed") + 
  theme(axis.title.y = element_blank()) +
  scale_color_discrete(name = "Model") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        strip.text.y.left = element_text(hjust = 0, vjust = 1, 
                                         angle = 0, size = 8),
        legend.position = "bottom") +
  ylab("Odds ratio") +
  coord_flip()

6.6 Predicted log odds

One limitation of the odds ratio plots is that it is hard to examine the potentially non-linear relationships between mortality and the continuous variable. The rms::Predict() function is especially helpul in this regard. We use it to vary all of the predictors and plot predicted log odds across the different values of each predictor.

Each prediction is made with all other variables at their “Adjust to” value as specified with datadist() above.

t(dd$limits) %>%
  as_tibble() %>%
  mutate(Variable = get_var_labs(colnames(dd$limits))) %>%
  relocate(Variable, .before = "Low:effect") %>%
  filter(!is.na(Variable)) %>%
  arrange(Variable) %>%
  kable() %>%
  kable_styling()
Variable Low:effect Adjust to High:effect Low:prediction High:prediction Low High
Acute myocardial infarction NA No NA No Yes No Yes
Age 49 62 75 18 89 18 89
AIDS/HIV NA No NA No Yes No Yes
Aspartate aminotransferase (AST) 25 36 57 6 157 4 157
Body Mass Index (BMI) 25.51 29.69 35.15 13.08 94.84 11.84 149.50
C-reactive protein (CRP) 28.57 71.10 131.60 0.00 458.00 0.00 458.00
Calendar time 38 47 64 0 91 0 91
Cancer NA No NA No Yes No Yes
Cerebrovascular disease NA No NA No Yes No Yes
Chronic pulmonary disease NA No NA No Yes No Yes
Congestive heart failure NA No NA No Yes No Yes
Creatinine 0.80 1.00 1.39 0.19 3.24 0.19 3.24
Dementia NA No NA No Yes No Yes
Diabetes NA No NA No Yes No Yes
Ethnicity NA Hispanic NA Hispanic Not Hispanic Hispanic Not Hispanic
Ferritin 203 461 984 3 3648 3 3648
Geographic division NA Pacific NA Pacific West North Central Pacific West North Central
Heart rate 77.5 87.5 98.0 36.0 165.0 20.0 203.0
Hemiplegia or paraplegia NA No NA No Yes No Yes
Hypertension NA No NA No Yes No Yes
Lactate dehydrogenase (LDH) 227.00 304.00 420.00 24.99 1050.00 24.99 1050.00
Lymphocyte count 0.7 1.0 1.4 0.0 3.5 0.0 3.5
Metastatic cancer NA No NA No Yes No Yes
Mild liver disease NA No NA No Yes No Yes
Moderate/severe liver disease NA No NA No Yes No Yes
Neutrophil count 3.33 4.87 7.10 0.00 18.29 0.00 18.29
Oxygen saturation 94 96 98 34 100 26 100
Peptic ulcer disease NA No NA No Yes No Yes
Peripheral vascular disease NA No NA No Yes No Yes
Platelet count (PLT) 158 203 262 2 569 1 569
Race NA African American NA African American Caucasian African American Caucasian
Race/Ethnicity NA Non-Hispanic white NA Non-Hispanic white Non-Hispanic black Non-Hispanic white Non-Hispanic black
Renal disease NA No NA No Yes No Yes
Respiration rate 18.0 20.0 22.0 2.0 65.5 2.0 99.0
Rheumatoid disease NA No NA No Yes No Yes
Sex NA Male NA Male Female Male Female
Smoking NA Never NA Never Previous Never Previous
Systolic blood pressure 115 126 139 33 242 30 266
Temperature 36.7 37.0 37.4 16.0 41.8 16.0 42.2
Troponin I 0.01 0.01 0.04 0.00 0.17 0.00 0.17
White blood cell count (WBC) 4.91 6.70 9.10 0.21 21.70 0.20 21.70

Let’s make the predictions.

lrm_log_odds <- Predict(lrm_fit_all, ref.zero = TRUE)

# Get plotting data
p_log_odds <- ggplot(lrm_log_odds, sepdiscrete = "list")

# Continuous plot
log_odds_limit <- max(ceiling(c(abs(p_log_odds$continuous$data$lower), 
                                abs(p_log_odds$continuous$data$upper))))
log_odds_breaks <- seq(-log_odds_limit, log_odds_limit, 2)
p_log_odds_continuous <- p_log_odds$continuous$data %>%
  as_tibble() %>%
  mutate(varlab = get_var_labs(.predictor.)) %>%
  ggplot(aes(x = .xx., y = yhat)) +
  facet_wrap(~varlab, scales = "free_x", ncol = 4) +
  geom_line() +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey") +
  ylab("Log odds") +
  scale_y_continuous(breaks = log_odds_breaks,
                     limits = c(-log_odds_limit, log_odds_limit)) +
  theme(axis.title.x = element_blank(), 
        strip.text = element_text(size = 7))

# Discrete plot
log_odds_limit <- max(ceiling(c(abs(p_log_odds$discrete$data$lower), 
                                abs(p_log_odds$discrete$data$upper))))
log_odds_breaks <- seq(-log_odds_limit, log_odds_limit, 1)
p_log_odds_discrete <- p_log_odds$discrete$data %>%
  as_tibble() %>%
  mutate(varlab = get_var_labs(.predictor.)) %>%
  ggplot(aes(x = yhat, y = .xx.)) +
  facet_wrap(~varlab, scales = "free_y", ncol = 4) +
  geom_point(size = .9) +
  geom_errorbarh(aes(xmin = lower , xmax = upper, height = 0)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
  xlab("Log odds") +
  scale_x_continuous(breaks = log_odds_breaks,
                     limits = c(-log_odds_limit, log_odds_limit)) +
  theme(axis.title.y = element_blank(),
        strip.text = element_text(size = 7))

# Combine plots
grid.arrange(p_log_odds_discrete, p_log_odds_continuous,
             heights = c(4, 6))

6.7 Predicted probabilities

The predicted log odds plot is a nice way to summarize non-linear effects, but it’s not the ultimate quantity of interest. We really care about predicted probabilities. But to compute a predicted probability we need to set all covariates in the model to certain values. Rather that choosing specific values (i.e., creating a representative patient), we will make predictions over a representative sample and average predictions across the sample.

# Make newdata
## Start with a random sample of patients
n_samples <- 1000
train_sample <- mi_list[[1]] %>% 
  filter(.imp > 0) %>%
  sample_n(size = n_samples)

We will write a general function to predict mortality as a function of (i) our fitted model, (ii) the representative sample, and (iii) the variables that we are varying.

expand_newdata <- function(data, vars_to_vary){
  varnames_to_vary <- names(vars_to_vary)
  expanded_vars_to_vary <- expand.grid(vars_to_vary)
  data %>% 
    select(-all_of(varnames_to_vary)) %>%
    crossing(expanded_vars_to_vary)
}

predict_mortality <- function(fit, newdata, vars_to_vary){
  expanded_newdata <- expand_newdata(newdata, vars_to_vary)
  fit_probs <- predict(fit, newdata = data.frame(expanded_newdata), 
                       type = "fitted", se.fit = FALSE)
  
  # Average predictions for variables to vary
  expanded_newdata %>%
    mutate(prob = fit_probs) %>%
    group_by(across(all_of(names(vars_to_vary)))) %>%
    summarise(prob = mean(prob))
}

predict_mortality_boot <- function(fit, newdata, vars_to_vary, B = 100){
  n_obs <- nrow(newdata)
  boot_probs <- vector(mode = "list", length = B)
  for (b in 1:B){
    indices <- sample(x = 1:n_obs, size = n_obs, replace = TRUE) 
    newdata_boot <- newdata[indices, ]
    boot_probs[[b]] <- predict_mortality(fit, newdata_boot, vars_to_vary)
    boot_probs[[b]][, "b"] <- b 
  }
  return(boot_probs %>% bind_rows())
}

6.7.1 Varying age

ages_to_vary  <- seq(min(train_data$age),  max(train_data$age), 1)

probs_age <- predict_mortality(lrm_fit_all, newdata = train_sample, 
                               vars_to_vary = list(age = ages_to_vary)) %>%
  rename(Age = age, `Mortality probability` = prob)
## `summarise()` ungrouping output (override with `.groups` argument)
# Table using DT package
datatable(probs_age, 
          rownames = FALSE,
          options = list(pageLength = 20)) %>%
  formatRound("Mortality probability", 4)
ggplot(probs_age, aes(x = `Age`, y = `Mortality probability`)) +
  geom_line() 

6.7.2 Varying calendar time

min_index_date <- min(train_data$index_date)
calendar_times_to_vary <- c(as.Date("2020-03-01"), 
                            as.Date("2020-04-01"), 
                            as.Date("2020-05-01")) - min_index_date 
calendar_times_to_vary <- as.numeric(calendar_times_to_vary)

probs_calendar_time <- predict_mortality(
  lrm_fit_all, newdata = train_sample, 
  vars_to_vary = list(calendar_time = calendar_times_to_vary)
) %>%
  mutate(Date = min_index_date + calendar_time) %>%
  rename(`Calendar time` = calendar_time, `Mortality probability` = prob) %>%
  relocate(Date)
## `summarise()` ungrouping output (override with `.groups` argument)
kable(probs_calendar_time) %>% kable_styling()
Date Calendar time Mortality probability
2020-03-01 10 0.2128545
2020-04-01 41 0.1661879
2020-05-01 71 0.1348575

6.7.3 Varying age and calendar time

We will start by predicting point estimates only.

# Predict point estimates
probs_age_calendar <- predict_mortality(
  lrm_fit_all, newdata = train_sample, 
  vars_to_vary = list(age = ages_to_vary,
                      calendar_time = calendar_times_to_vary))
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
# Summarize in table
probs_age_calendar %>%
  arrange(calendar_time, age) %>%
  mutate(Date = min_index_date + calendar_time) %>%
  select(-calendar_time) %>%
  relocate(Date) %>%
  rename(`Age` = age,
         `Mortality probability` = prob) %>%
  
  ## Using DT::datatable
  datatable(rownames = FALSE,
            filter = "top",
          options = list(pageLength = 20)) %>%
  formatRound("Mortality probability", 4)

We we will also compute 95% confidence intervals by bootstrapping.

bootprobs_age_calendar <- predict_mortality_boot(
  lrm_fit_all, newdata = train_sample, 
  vars_to_vary = list(age = ages_to_vary,
                      calendar_time = calendar_times_to_vary),
  B = n_boot_probs) 

Let’s plot the results.

bootprobs_age_calendar %>%
  # Summarize bootstrap results
  group_by(age, calendar_time) %>%
  summarise(
    prob_mean = mean(prob),
    prob_lower = quantile(prob, .025),
    prob_upper = quantile(prob, .975)
  ) %>%
  mutate(date = factor(min_index_date + calendar_time)) %>%

  # Plot
  ggplot(aes(x = age, y = prob_mean)) +
    geom_line(aes(color = date)) +
    geom_ribbon(aes(ymin = prob_lower, ymax = prob_upper, fill = date),
                alpha = 0.2) +
    xlab("Age") +
    ylab("Mortality probability") +
    scale_color_discrete("") +
    scale_fill_discrete("") +
    theme(legend.position = "bottom")
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

7 Internal model validation

7.1 Optimism-adjusted model performance

We will start by using bootstrapping to estimate model performance and check for whether our in-sample fits are too optimistic. Specifically, we will use the following algorithm implemented in the rms package:

  1. Calculate performance from the model fit on the original training sample, denoted as \(\theta_{orig}\).
  2. For \(b = 1,\ldots,B\):
    1. Bootstrap (with replacement) from the original training sample.
    2. Fit the model to the bootstrapped data and estimate model performance, denoted as \(\theta_{train}\).
    3. Use the bootstrap fit to measure model performance on the original training sample, denoted as \(\theta_{test}\).
  3. Calculate “optimism” as \(O = B^{-1} \sum_{b = 1}^B \theta_{b, train} - \theta_{b, test}\).
  4. Calculate optimism-adjusted performance as \(\theta = \theta_{orig} - O\).

A shrinkage factor can also be estimated within each bootstrap sample to gauge the extent of overfitting. This is done by fitting \(g(Y) = \gamma_0 + \gamma_1 X\hat{\beta}\) where \(X\) and \(Y\) are the covariates and outcome, respectively, in the test sample (i.e., in step 2c) and \(\hat{\beta}\) is estimating in the training sample (i.e., step 2b). If there is no overfitting, then \(\gamma_0 = 0\) and \(\gamma_1 = 1\); conversely, if there is overfitting, then \(\gamma_1 < 1\) and \(\gamma_0 \neq 1\) to compensate.

lrm_val_age <- validate(lrm_fit_age, B = n_boot_val)
lrm_val_c <- validate(lrm_fit_c, B = n_boot_val) 
lrm_val_d <- validate(lrm_fit_d, B = n_boot_val)
lrm_val_dc <- validate(lrm_fit_dc, B = n_boot_val)
lrm_val_all <- validate(lrm_fit_all, B = n_boot_val) 
lrm_val_train <- list(lrm_val_age, lrm_val_c, lrm_val_d,
                      lrm_val_dc, lrm_val_all)
bind_cindex <- function(object){
  n_rows <- nrow(object)
  c_index <- (object["Dxy", 1:3] + 1)/2
  c_index[4] <- c_index[2] - c_index[3]
  c_index[5] <- c_index[1] - c_index[4]
  c_index[6] <- object[1, 6]
  
  return(rbind(object, c_index))
}

make_validation_tbl <- function(object){
  object %>% 
    bind_cindex() %>%
    set_colnames(c("(1) Original", "(2) Bootstrap training", 
                   "(3) Bootstrap test", "Optimism: (2) - (3)", 
                   "Original (corrected): (1) - (4)", "N")) %>%
    kable() %>%
    kable_styling()
}

make_validation_tbl(lrm_val_age)
  1. Original
  1. Bootstrap training
  1. Bootstrap test
Optimism: (2) - (3) Original (corrected): (1) - (4) N
Dxy 0.5491275 0.5497881 0.5491275 0.0006606 0.5484668 40
R2 0.2120752 0.2124468 0.2120752 0.0003716 0.2117036 40
Intercept 0.0000000 0.0000000 0.0018166 -0.0018166 0.0018166 40
Slope 1.0000000 1.0000000 0.9983903 0.0016097 0.9983903 40
Emax 0.0000000 0.0000000 0.0006742 0.0006742 0.0006742 40
D 0.1318283 0.1319522 0.1318283 0.0001239 0.1317043 40
U -0.0001464 -0.0001464 0.0000094 -0.0001558 0.0000094 40
Q 0.1319747 0.1320986 0.1318189 0.0002798 0.1316949 40
B 0.1160266 0.1157696 0.1160511 -0.0002815 0.1163081 40
g 1.3960983 1.3986006 1.3960983 0.0025023 1.3935960 40
gp 0.1458584 0.1457080 0.1458584 -0.0001504 0.1460088 40
c_index 0.7745637 0.7748941 0.7745637 0.0003303 0.7742334 40
make_validation_tbl(lrm_val_c) 
  1. Original
  1. Bootstrap training
  1. Bootstrap test
Optimism: (2) - (3) Original (corrected): (1) - (4) N
Dxy 0.4620303 0.4597392 0.4555836 0.0041556 0.4578747 40
R2 0.1382186 0.1399679 0.1364480 0.0035199 0.1346987 40
Intercept 0.0000000 0.0000000 -0.0202269 0.0202269 -0.0202269 40
Slope 1.0000000 1.0000000 0.9871831 0.0128169 0.9871831 40
Emax 0.0000000 0.0000000 0.0065711 0.0065711 0.0065711 40
D 0.0838930 0.0850194 0.0827716 0.0022478 0.0816452 40
U -0.0001464 -0.0001464 0.0000103 -0.0001567 0.0000103 40
Q 0.0840394 0.0851659 0.0827614 0.0024045 0.0816349 40
B 0.1215780 0.1213851 0.1217809 -0.0003959 0.1219738 40
g 0.7727543 0.7761374 0.7660445 0.0100929 0.7626614 40
gp 0.1097598 0.1102586 0.1089013 0.0013573 0.1084024 40
c_index 0.7310151 0.7298696 0.7277918 0.0020778 0.7289373 40
make_validation_tbl(lrm_val_all)
  1. Original
  1. Bootstrap training
  1. Bootstrap test
Optimism: (2) - (3) Original (corrected): (1) - (4) N
Dxy 0.7598518 0.7645541 0.7557407 0.0088134 0.7510385 40
R2 0.4312037 0.4370952 0.4248053 0.0122899 0.4189138 40
Intercept 0.0000000 0.0000000 -0.0373035 0.0373035 -0.0373035 40
Slope 1.0000000 1.0000000 0.9718907 0.0281093 0.9718907 40
Emax 0.0000000 0.0000000 0.0130555 0.0130555 0.0130555 40
D 0.2892845 0.2941307 0.2843173 0.0098134 0.2794711 40
U -0.0001464 -0.0001464 0.0001125 -0.0002589 0.0001125 40
Q 0.2894310 0.2942771 0.2842048 0.0100723 0.2793586 40
B 0.0905945 0.0898474 0.0913369 -0.0014894 0.0920839 40
g 2.2890365 2.3076809 2.2422073 0.0654736 2.2235629 40
gp 0.1999983 0.2014582 0.1985780 0.0028802 0.1971181 40
c_index 0.8799259 0.8822770 0.8778704 0.0044067 0.8755192 40

7.2 Calibration curves

We will first use the bootstrap so that we can geneate bias adjusted predictions.

# Calibrate
lrm_cal_age <- calibrate(lrm_fit_age, B = n_boot_val)
lrm_cal_c <- calibrate(lrm_fit_c, B = n_boot_val)
lrm_cal_d <- calibrate(lrm_fit_d, B = n_boot_val)
lrm_cal_dc <- calibrate(lrm_fit_dc, B = n_boot_val)
lrm_cal_all <- calibrate(lrm_fit_all, B = n_boot_val)
lrm_cal_list <- list(lrm_cal_age, lrm_cal_c, lrm_cal_d, lrm_cal_dc, lrm_cal_all)
names(lrm_cal_list) <- lrm_names

Calibration plots can then be created.

plot_calibration <- function(object){
  # Make tibble
  cal_df <- map2(object, names(object), function(x, y){
    x[, ] %>%
      as_tibble() %>%
      mutate(model = y)
  }) %>% 
    bind_rows() %>%
    mutate(model = factor(model, levels = lrm_names))
  
  
  # Plot
  breaks <- seq(0, 1, .2)

  ggplot() + 
    geom_line(data = cal_df, mapping = aes(x = predy, y = calibrated.orig, 
                                           color = "Apparent")) + 
    geom_line(data = cal_df, mapping = aes(x = predy, y = calibrated.corrected,
                                           color = "Bias-corrected")) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
    facet_wrap(~model) +
    scale_x_continuous(breaks = breaks, limits = c(0, 1)) +
    scale_y_continuous(breaks = breaks, limits = c(0, 1)) +
    xlab("Predicted probability") +
    ylab("Actual probability") +
    scale_colour_manual(name = "",
                        values = c("Apparent" = "black", 
                                   "Bias-corrected" = "red")) +
    theme(legend.position = "bottom") 
}

plot_calibration(lrm_cal_list)

Calibration at the tails of the predictive distribution might be based on fewer patients, so it can be useful to plot the distribution of predicted probabilities. We start with histograms.

predprobs_df <- map2(lrm_cal_list, names(lrm_cal_list), function (x, model_name){ 
  x_df <- tibble(model =model_name, predicted = attr(x, "predicted"))
  }) %>%
   bind_rows() %>%
  mutate(model = factor(model, levels = names(lrm_cal_list)))

plot_predprobs_hist <- function(data){
  ggplot(data, aes(x = predicted)) + 
    facet_wrap(~model, scales = "free_y") +
    geom_histogram(fill = "black", bins = 200) +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
    scale_y_continuous(breaks = function (x) floor(seq(0, .9 * max(x), 
                                                       length.out = 7))) +
    xlab("Predicted Probability") +
    ylab("Count") 
}

plot_predprobs_hist(predprobs_df)
## Warning: Removed 10 rows containing missing values (geom_bar).

The cumulative density function (CDF) is also informative since it provides the proportion of patients with predicted probabilities above and below given thresholds. For example, we can see that in the full model only a proportion 0.09 have predicted probabilities above 0.5.

plot_predprobs_cdf <- function(data){
  ggplot(data, aes(x = predicted, col = model)) + 
    stat_ecdf() +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
    xlab("Predicted Probability") +
    ylab("CDF") +
    scale_color_discrete(name = "") +
    theme(legend.position = "bottom")
}

plot_predprobs_cdf(predprobs_df)

8 Model validation on test set

The specification of the model has been finalized so we are ready to assess it on the test set.

8.1 Preprocess test data

To make predictions on the training set, we must preprocess the data in the same way that we preprocessed the training data. Multiple imputation will be performed on the combined train and test set.

test_data <- readRDS("test_data.rds")  %>%
  filter_ie()
## 183 (5.07%) patients were dropped due to the 2-week cutoff.
# Clean data
test_data <- clean_data(test_data)

# Truncate labs
test_data <- add_truncated_lab_vars(test_data, v = lab_vars$var)

# Missing data imputation
## rbind training and test sets
train_data$set = "train"
test_data$set = "test"
train_test_data <- bind_rows(train_data, test_data)

## Multiple imputation via MICE
mice_train_test <- train_test_data %>%
  select(one_of(mice_vars)) %>%
  mutate_if(is.character, as.factor) %>%
  mice(m = n_imputations, maxit = 5)
## 
##  iter imp variable
##   1   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   1   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   2   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   3   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   4   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   1  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   2  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   3  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   4  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
##   5   5  sex  race  ethnicity  division  smoke  bmi  sbp  hr  resp  temp  spo2  ast_t  crp_t  creatinine_t  ferritin_t  ldh_t  lymphocyte_t  neutrophil_t  tni_t  plt_t  wbc_t
## Add death and subset to test set
mi_df_test <- complete(mice_train_test, action = "long", include = FALSE) %>%
  as_tibble() %>%
  mutate(died = rep(train_test_data$died, mice_train_test$m),
         set = rep(train_test_data$set, mice_train_test$m)) %>%
  filter(set == "test")

## Create race/ethnicity variable
mi_df_test <-  add_race_ethnicity(mi_df_test)

Before making predictions, it is helpful to compare the training and test sets.

tbl1_train_test <- train_test_data %>%
  select(one_of("died", "set", tbl1_varnames)) %>%
  rename_with(get_var_labs, .cols = all_of(tbl1_varnames)) %>%
  rename(Died = died) %>%
  mutate(set = ifelse(set == "train", "Train set", "Test set")) %>%
  CreateTableOne(vars = c("Died", get_var_labs(tbl1_varnames)), 
                 factorVars = c("Died", get_var_labs(tbl1_cat_varnames)), 
                 strata = "set",  addOverall = TRUE, 
                 data = .)

print(tbl1_train_test, nonnormal = get_var_labs(tbl1_non_normal_varnames),  
      cramVars = c("Sex"),
      contDigits = 3, missing = TRUE, printToggle = FALSE) %>%
  set_colnames(c("Overall", "Test set", "Train set",
                "p", "Test", "Missing")) %>%
  kable() %>%
  kable_styling()
Overall Test set Train set p Test Missing
n 17086 3428 13658
Died = 1 (%) 2660 (15.6) 497 (14.5) 2163 (15.8) 0.057 0.0
Age (median [IQR]) 62.000 [49.000, 75.000] 62.000 [49.000, 74.000] 62.000 [49.000, 75.000] 0.416 nonnorm 0.0
Calendar time (median [IQR]) 47.000 [38.000, 64.000] 47.000 [37.000, 64.000] 47.000 [38.000, 64.000] 0.272 nonnorm 0.0
Geographic division (%) <0.001 2.7
East North Central 5835 (35.1) 1208 (36.3) 4627 (34.8)
Middle Atlantic 5900 (35.5) 1264 (38.0) 4636 (34.9)
New England 1915 (11.5) 332 (10.0) 1583 (11.9)
Other 238 ( 1.4) 47 ( 1.4) 191 ( 1.4)
Pacific 591 ( 3.6) 80 ( 2.4) 511 ( 3.8)
South Atl/West South Crl 455 ( 2.7) 91 ( 2.7) 364 ( 2.7)
West North Central 1690 (10.2) 306 ( 9.2) 1384 (10.4)
Race/Ethnicity (%) 0.069 26.0
Non-Hispanic white 7071 (56.0) 1424 (55.9) 5647 (56.0)
Asian 444 ( 3.5) 82 ( 3.2) 362 ( 3.6)
Hispanic 700 ( 5.5) 167 ( 6.6) 533 ( 5.3)
Non-Hispanic black 4423 (35.0) 876 (34.4) 3547 (35.2)
Sex = Female/Male (%) 8223/8857 (48.1/51.9) 1660/1766 (48.5/51.5) 6563/7091 (48.1/51.9) 0.700 0.0
Smoking (%) 0.367 25.7
Current 1078 ( 8.5) 212 ( 8.4) 866 ( 8.5)
Never 7714 (60.8) 1507 (59.7) 6207 (61.1)
Previous 3896 (30.7) 804 (31.9) 3092 (30.4)
Acute myocardial infarction = Yes (%) 1896 (11.1) 361 (10.5) 1535 (11.2) 0.250 0.0
AIDS/HIV = Yes (%) 136 ( 0.8) 35 ( 1.0) 101 ( 0.7) 0.121 0.0
Cancer = Yes (%) 2087 (12.2) 409 (11.9) 1678 (12.3) 0.591 0.0
Cerebrovascular disease = Yes (%) 1807 (10.6) 368 (10.7) 1439 (10.5) 0.758 0.0
Chronic pulmonary disease = Yes (%) 4623 (27.1) 996 (29.1) 3627 (26.6) 0.003 0.0
Congestive heart failure = Yes (%) 2921 (17.1) 596 (17.4) 2325 (17.0) 0.631 0.0
Dementia = Yes (%) 1706 (10.0) 312 ( 9.1) 1394 (10.2) 0.058 0.0
Diabetes = Yes (%) 5800 (33.9) 1188 (34.7) 4612 (33.8) 0.336 0.0
Hemiplegia or paraplegia = Yes (%) 419 ( 2.5) 89 ( 2.6) 330 ( 2.4) 0.584 0.0
Hypertension = Yes (%) 10021 (58.7) 2018 (58.9) 8003 (58.6) 0.787 0.0
Metastatic cancer = Yes (%) 355 ( 2.1) 78 ( 2.3) 277 ( 2.0) 0.401 0.0
Mild liver disease = Yes (%) 1096 ( 6.4) 217 ( 6.3) 879 ( 6.4) 0.852 0.0
Moderate/severe liver disease = Yes (%) 164 ( 1.0) 36 ( 1.1) 128 ( 0.9) 0.611 0.0
Peptic ulcer disease = Yes (%) 252 ( 1.5) 46 ( 1.3) 206 ( 1.5) 0.520 0.0
Peripheral vascular disease = Yes (%) 2103 (12.3) 432 (12.6) 1671 (12.2) 0.578 0.0
Renal disease = Yes (%) 3541 (20.7) 708 (20.7) 2833 (20.7) 0.927 0.0
Rheumatoid disease = Yes (%) 500 ( 2.9) 102 ( 3.0) 398 ( 2.9) 0.893 0.0
CCI (median [IQR]) 1.000 [0.000, 3.000] 1.000 [0.000, 3.000] 1.000 [0.000, 3.000] 0.124 nonnorm 0.0
Body Mass Index (BMI) (median [IQR]) 29.660 [25.510, 35.010] 29.560 [25.500, 34.623] 29.690 [25.510, 35.130] 0.339 nonnorm 12.0
Diastolic blood pressure (median [IQR]) 73.000 [65.500, 80.500] 73.000 [65.000, 80.500] 73.000 [65.500, 80.500] 0.803 nonnorm 3.5
Heart rate (median [IQR]) 87.500 [77.500, 98.000] 87.500 [78.000, 98.000] 87.500 [77.500, 98.000] 0.906 nonnorm 3.4
Oxygen saturation (median [IQR]) 96.000 [94.000, 98.000] 96.000 [94.000, 98.000] 96.000 [94.000, 98.000] 0.979 nonnorm 4.3
Respiration rate (median [IQR]) 20.000 [18.000, 22.000] 20.000 [18.000, 22.000] 20.000 [18.000, 22.000] 0.980 nonnorm 4.2
Systolic blood pressure (median [IQR]) 126.000 [115.000, 139.000] 126.000 [115.000, 139.000] 126.000 [115.000, 139.000] 0.663 nonnorm 3.5
Temperature (median [IQR]) 37.000 [36.700, 37.400] 37.000 [36.700, 37.400] 37.000 [36.700, 37.400] 0.200 nonnorm 3.5
Alanine aminotransferase (ALT) (median [IQR]) 28.000 [18.000, 46.000] 28.000 [18.000, 46.000] 28.000 [18.000, 46.000] 0.791 nonnorm 20.3
Aspartate aminotransferase (AST) (median [IQR]) 37.000 [25.000, 58.000] 38.000 [25.000, 59.000] 37.000 [25.000, 58.000] 0.383 nonnorm 21.3
C-reactive protein (CRP) (median [IQR]) 79.100 [33.800, 141.360] 79.900 [31.600, 147.083] 79.100 [34.000, 140.000] 0.615 nonnorm 39.3
Creatinine (median [IQR]) 1.000 [0.800, 1.415] 1.000 [0.800, 1.430] 1.000 [0.800, 1.410] 0.410 nonnorm 10.7
Ferritin (median [IQR]) 509.000 [223.000, 1082.500] 505.000 [220.000, 1085.000] 510.000 [224.000, 1080.000] 0.897 nonnorm 44.2
Fibrin D-Dimer (median [IQR]) 762.000 [405.000, 1600.000] 880.000 [460.500, 1810.000] 750.000 [390.000, 1540.750] 0.083 nonnorm 90.5
Lactate dehydrogenase (LDH) (median [IQR]) 321.000 [238.000, 441.000] 316.000 [237.000, 443.000] 321.000 [238.000, 441.000] 0.925 nonnorm 45.7
Lymphocyte count (median [IQR]) 0.990 [0.700, 1.400] 0.973 [0.700, 1.400] 1.000 [0.700, 1.400] 0.342 nonnorm 11.4
Neutrophil count (median [IQR]) 4.900 [3.400, 7.160] 4.940 [3.440, 7.330] 4.900 [3.370, 7.100] 0.083 nonnorm 11.5
Platelet count (PLT) (median [IQR]) 202.000 [157.000, 261.000] 203.000 [157.000, 264.000] 202.000 [157.000, 260.000] 0.438 nonnorm 10.1
Procalcitonin (median [IQR]) 0.130 [0.070, 0.370] 0.140 [0.070, 0.361] 0.130 [0.070, 0.370] 0.684 nonnorm 49.5
Troponin I (median [IQR]) 0.010 [0.010, 0.050] 0.010 [0.010, 0.050] 0.010 [0.010, 0.050] 0.243 nonnorm 41.2
White blood cell count (WBC) (median [IQR]) 6.700 [4.900, 9.130] 6.800 [5.000, 9.400] 6.700 [4.900, 9.100] 0.035 nonnorm 9.9

8.2 Predict

Our main predictions are based on the unpenalized logistic regression.

lrm_fits <- list(age = lrm_fit_age, 
                 c = lrm_fit_c,
                 d = lrm_fit_d,
                 dc = lrm_fit_dc,
                 all= lrm_fit_all)
lrm_prob_names <- paste0("lrm_probs_", names(lrm_fits))
for (i in 1:length(lrm_fits)){
  mi_df_test[[lrm_prob_names[i]]] <- predict(lrm_fits[[i]], 
                                             newdata = as.data.frame(mi_df_test), 
                                             type = "fitted", se.fit = FALSE)
}

8.3 Evaluation metrics

Let’s evaluate the models using the metrics returned by rms::val.prob().

lrm_val_test <- vector(mode = "list", length = length(lrm_fits))
names(lrm_val_test) <- lrm_names
for (i in 1:length(lrm_val_test)){
  lrm_val_test[[i]] <- val.prob(p = mi_df_test[[lrm_prob_names[i]]],
                                y = mi_df_test$died,
                                pl = FALSE)
}
summarize_val_probs <- function(object){
  map_df(object, function (x) {
  as_tibble(matrix(x, nrow = 1)) %>%
    set_colnames(names(x))
}, .id = "Model") %>%
  kable(digits = 4) %>%
  kable_styling() %>%
  scroll_box(width = "100%")
}
summarize_val_probs(lrm_val_test)
Model Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq U:p Q Brier Intercept Slope Emax E90 Eavg S:z S:p
Age only 0.5116 0.7558 0.1725 0.1021 1751.513 NA 0.0015 27.3103 0e+00 0.1007 0.1114 -0.2091 0.9111 0.0687 0.0390 0.0135 -1.8440 0.0652
Comorbidities only 0.4372 0.7186 0.1187 0.0691 1185.986 NA 0.0009 17.4343 2e-04 0.0682 0.1151 -0.1358 0.9707 0.2426 0.0291 0.0216 -2.4338 0.0149
All demographics 0.5466 0.7733 0.2014 0.1203 2062.555 NA 0.0009 17.8625 1e-04 0.1194 0.1084 -0.1477 0.9550 0.0550 0.0240 0.0129 -2.7595 0.0058
Demographics and comorbidities 0.5816 0.7908 0.2273 0.1369 2346.750 NA 0.0009 17.8330 1e-04 0.1359 0.1063 -0.1421 0.9603 0.1537 0.0204 0.0109 -1.9601 0.0500
All variables 0.7424 0.8712 0.3897 0.2476 4245.021 NA 0.0021 38.1363 0e+00 0.2455 0.0892 -0.2066 0.9238 0.1438 0.0255 0.0127 0.4263 0.6699

We will focus on the Brier score and C-Index and assess performance on both the training and test sets.

summarize_performance <- function(val_train, val_test){
  # Get metrics
  train_metrics <- map_df(val_train, function (x) {
    x[c("Dxy", "B"), "index.orig"]
    }) %>%
    mutate(Dxy = (Dxy + 1)/2) %>%
    rename(c_index = Dxy)
  
  test_metrics <- map_df(val_test, function(x) x[c("C (ROC)", "Brier")])
  
  # Make table
  bind_cols(
    tibble(Model = names(val_test)),
    train_metrics,
    test_metrics
  ) %>%
    kable(col.names = c("Model", "C-Index", "Brier", "C-Index", "Brier"),
          digits = 4) %>%
    kable_styling() %>%
    add_header_above(c(" ", "Train" = 2, "Test" = 2))
}

summarize_performance(val_train = lrm_val_train, val_test = lrm_val_test)
Train
Test
Model C-Index Brier C-Index Brier
Age only 0.7746 0.1160 0.7558 0.1114
Comorbidities only 0.7310 0.1216 0.7186 0.1151
All demographics 0.7845 0.1145 0.7733 0.1084
Demographics and comorbidities 0.8019 0.1119 0.7908 0.1063
All variables 0.8799 0.0906 0.8712 0.0892

8.4 Calibration curves

Like we did with the training set, we will plot calibration curves.

make_cal_test_data <- function(data, p , model_names){
  data[, c("died", ".imp", p)] %>%
    pivot_longer(cols = c(p), names_to = "model",
               values_to = "predicted") %>%
    rename(actual = died) %>%
    mutate(model = model_names[match(model, p)],
          model = factor(model, levels = model_names))
}

plot_calibration_test <- function(cal_data){
  ggplot(cal_data, aes(x = predicted, y = actual)) +
    facet_wrap(~model) +
    geom_smooth(se = FALSE, method = "loess", formula = y ~ x, color = "black",
                size = 1) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
    scale_x_continuous(breaks = seq(0, 1, .2), limits = c(0, 1)) +
    scale_y_continuous(breaks = seq(0, 1, .2), limits = c(0, 1)) +
    xlab("Predicted probability") +
    ylab("Actual probability") +
    theme(legend.position = "bottom")
}

cal_test_data <- make_cal_test_data(mi_df_test, p = lrm_prob_names,
                                    model_names = lrm_names)
plot_calibration_test(cal_test_data)

It is again useful to visualize the distribution of predicted probabilities since calibration at the tails might be based on a smaller number of patients.

plot_predprobs_hist(cal_test_data)
## Warning: Removed 10 rows containing missing values (geom_bar).

plot_predprobs_cdf(cal_test_data)

8.5 Penalized models

We will run sensitivity analyses on the full model to see if penalization can improve calibration and/or discrimination. The three models that we will consider are (i) logistic regression with a ridge penalty, (ii) logistic regression with a lasso penalty, and (iii) the standard upenalized logistic regression (i.e., from the results presented above).

The penalized logistic regression models can be fit using glmnet.

x_train <- mi_list %>% map(function(data) 
  make_x(data, model_rhs)
)

# Cross validation
lasso_cvfits <- ridge_cvfits <-  vector(mode = "list", length = length(x_train))
for (i in 1:length(lasso_cvfits)){
  ridge_cvfits[[i]] <- cv.glmnet(x = x_train[[i]], y = y[[i]], 
                                 family = "binomial", alpha = 0)
  lasso_cvfits[[i]] <- cv.glmnet(x = x_train[[i]], y = y[[i]], 
                                 family = "binomial", alpha = 1)
}

Now let’s make predictions for the penalized models. The prediction for a given patient is the average prediction across the imputed datasets.

predict_glmnet_mi <- function(fits, newx){
  n_fits <- length(fits)
  pred <- matrix(NA, nrow = nrow(newx), ncol = n_fits)
  for (j in 1:n_fits){
    pred[, j] <- predict(fits[[i]], newx = x_test, 
                         s = "lambda.1se", type = "response")
  }
  return(apply(pred, 1, mean)) 
}

x_test <- make_x(mi_df_test, model_rhs)
mi_df_test$ridge_probs_all <- predict_glmnet_mi(ridge_cvfits, x_test)
mi_df_test$lasso_probs_all <- predict_glmnet_mi(lasso_cvfits, x_test)

With predictions in hand, we can evaluate the model.

penalized_val_test <- vector(mode = "list", length = 2)
names(penalized_val_test) <- c("Lasso", "Ridge")
penalized_prob_names <- paste0(c("lasso", "ridge"), "_probs_all")
for (i in 1:length(penalized_val_test)){
  penalized_val_test[[i]] <- val.prob(p = mi_df_test[[penalized_prob_names[i]]],
                                      y = mi_df_test$died,
                                      pl = FALSE)
}  

We will first assess various evaluation metrics with an emphasis on the Brier score and C-Index.

summarize_val_probs(penalized_val_test)
Model Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq U:p Q Brier Intercept Slope Emax E90 Eavg S:z S:p
Lasso 0.7300 0.8650 0.3728 0.2355 4038.067 NA 0.0017 30.3799 0 0.2339 0.0913 -0.0786 1.0430 0.1366 0.0218 0.0159 -3.8715 1e-04
Ridge 0.7266 0.8633 0.3415 0.2135 3660.216 NA 0.0197 340.4064 0 0.1937 0.0948 -0.4650 0.9891 0.2006 0.0954 0.0450 -9.1427 0e+00

Finally, we will compare calibration curves for each of the three models.

make_cal_test_data(mi_df_test,
                   p = c(penalized_prob_names, "lrm_probs_all"),
                   model_names = c(names(penalized_val_test), "No penalty")) %>%
  plot_calibration_test()